Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 4 | use lib 'lib', '../lib'; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 5 | use Getopt::Long; |
| 6 | use Benchmark qw/:hireswallclock/; |
| 7 | use IO::Compress::Gzip qw/$GzipError/; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 8 | use Log::Log4perl; |
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 9 | |
Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame^] | 10 | use KorAP::XML::Krill; |
| 11 | use KorAP::XML::Tokenizer; |
| 12 | |
| 13 | our $VERSION = 0.04; |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 14 | |
| 15 | # Merges foundry data to create indexer friendly documents |
Nils Diewald | 32e30f0 | 2014-10-30 00:52:36 +0000 | [diff] [blame] | 16 | # ndiewald, 2014/10/29 |
Nils Diewald | 7364d1f | 2013-11-05 19:26:35 +0000 | [diff] [blame] | 17 | |
Akron | 93d620e | 2016-02-05 19:40:05 +0100 | [diff] [blame^] | 18 | # 2016/02/04 |
| 19 | # - renamed to korapxml2krill |
| 20 | # - added Schreibgebrauch support |
| 21 | |
# Print the usage message and terminate the script.
#
# Parameters:
#   $_[0] - exit status to terminate with (optional, defaults to 0)
#
# Never returns. Usage that was explicitly requested (status 0) is
# written to STDOUT. Usage that is shown because of a calling error
# (non-zero status) is written to STDERR, as STDOUT is the default
# destination for the generated document.
sub printhelp {
  my $exit_code = defined $_[0] ? $_[0] : 0;
  my $out = $exit_code ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
  korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by defining the name
                                  with a # in front of the foundry,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2016/02/04

EOHELP
  exit($exit_code);
};
| 59 | |
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

# GetOptions returns false for unknown or malformed options (and
# reports them on STDERR itself) - show the usage and stop in that
# case instead of continuing with a partly parsed command line
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp() }
) or printhelp(1);

# The input directory is mandatory and compressed
# output requires an output file
printhelp(1) if !$input || ($gzip && !$output);

# Only report errors unless requested otherwise
$log_level //= 'ERROR';
| 81 | |
# Lookup table of all skip requests, keyed in lower case
my %skip = map { ( lc($_) => 1 ) } @skip;

# Configure the root logger with the requested level
# and a single appender named STDERR
my %log_conf = (
  'log4perl.rootLogger'             => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'        => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');

# Ignore processing - the output file is already
# there and overwriting was not requested
if ($output && !$overwrite && -e $output) {
  $log->trace("$output already exists");
  exit(0);
};
| 99 | |
# Take both reference timestamps as early as possible (at compile time):
# $main::TIME marks the start of the overall run,
# $main::LAST_STOP marks the start of the current measuring interval
BEGIN {
  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
};
| 104 | |
# Log (on trace level) the time elapsed since the last call and since
# the start of the script, then start a new measuring interval.
#
# Takes no parameters. Reads $main::TIME and $main::LAST_STOP and
# replaces $main::LAST_STOP with the current timestamp.
sub stop_time {
  my $now = Benchmark->new;

  # Formatting both time differences is wasted work whenever trace
  # messages are discarded anyway, so only build the message on demand
  if ($log->is_trace) {
    $log->trace(
      'The code took: '.
        timestr(timediff($now, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($now, $main::TIME)) . ')'
      );
  };

  # The interval is restarted independent of the log level
  $main::LAST_STOP = $now;
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 114 | |
# Example call:
#   perl script/korapxml2krill --input WPD/AAA/00001

# Create and parse new document
# (make sure the path ends with a slash first)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

# Name of the document in log messages - the output file is
# optional, so fall back to the input directory if it is not set
my $doc_name = $output // $input;

# Unable to process the document data
# (the exit status is intentionally kept at 0 here)
unless ($doc->parse) {
  $log->warn($doc_name . " can't be processed - no document data");
  exit(0);
};

# The base tokenization defaults to OpenNLP#Tokens.
# The layer part of --token is optional, so only override
# the parts that were actually passed
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);

# Unable to process base tokenization
# (the exit status is intentionally kept at 0 here)
unless ($tokens->parse) {
  $log->error($doc_name . " can't be processed - no base tokenization");
  exit(0);
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 145 | |
# All annotation layers that are added by default, as a list of
# [foundry, layer] pairs. The table below names each foundry once,
# followed by its layers; the order of the resulting pairs is the
# order in which the layers are added.
my @layers = map {
  my ($foundry, @names) = @$_;
  map { [$foundry, $_] } @names;
} (
  [Base       => qw/Sentences Paragraphs/],
  [Connexor   => qw/Morpho Syntax Phrase Sentences/],
  [CoreNLP    => qw/NamedEntities Sentences Morpho Constituency/],
  [DeReKo     => qw/Structure/],
  [Glemm      => qw/Morpho/],
  # Malt is currently disabled
  # [Malt     => qw/Dependency/],
  [Mate       => qw/Morpho Dependency/],
  [OpenNLP    => qw/Morpho Sentences/],
  # Schreibgebrauch
  [Sgbr       => qw/Lemma Morpho/],
  [TreeTagger => qw/Morpho Sentences/],
  [XIP        => qw/Morpho Constituency Sentences Dependency/]
);
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 192 | |
| 193 | |
# Add the annotation layers to the token stream
if ($skip{'#all'}) {

  # All default layers are skipped - only add
  # the explicitly allowed foundry#layer pairs
  foreach my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  };
}
else {

  # Add to index file - respect skipping
  foreach my $info (@layers) {
    my ($foundry, $layer) = map { lc($_) } @$info;

    # A skip request either names a whole foundry (e.g. "Mate")
    # or a single layer of a foundry (e.g. "Mate#Morpho")
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 209 | |
# Serialize the document: human friendly text if requested,
# otherwise JSON (optionally pretty printed)
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {

  # IO::File is only needed here, so make sure it is loaded
  require IO::File;

  # Write to the output file, compressed if requested
  my $file = $gzip
    ? IO::Compress::Gzip->new($output, Minimal => 1)
    : IO::File->new($output, "w");

  # Both constructors return undef on failure - the reason is
  # found in $GzipError for gzip and in $! for plain files
  unless ($file) {
    $log->error(
      'Unable to open ' . $output . ' for writing: ' .
        ($gzip ? $GzipError : $!)
      );
    exit(1);
  };

  # Buffered write errors may only surface on close, so check both
  unless ($file->print($print_text) && $file->close) {
    $log->error(
      'Unable to write to ' . $output . ': ' .
        ($gzip ? $GzipError : $!)
      );
    exit(1);
  };
}

# No output file given - write to STDOUT
else {
  print $print_text . "\n";
};

stop_time();
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 233 | |
| 234 | __END__ |