Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl |
use strict;
use warnings;
use lib 'lib', '../lib';

use Benchmark qw/:hireswallclock/;
use Getopt::Long;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;

use KorAP::Document;
use KorAP::Tokenizer;
| 11 | |
# Script version. Kept as a string so the value is never altered by
# numeric formatting when it is printed or compared.
our $VERSION = '0.01';

# Merges foundry data to create indexer friendly documents
# ndiewald, 2013/11/05
| 16 | |
# Print the usage message and terminate the script.
#
# Arguments:
#   $_[0] - optional exit status; defaults to 0 when not defined.
#
# The text is written to STDOUT for a zero status (explicit --help) and to
# STDERR for a non-zero status (usage error), so that an error message can
# never end up in output that is piped to another program.
# This function does not return; it always calls exit().
sub printhelp {
  my $status = defined $_[0] ? $_[0] : 0;
  my $out    = $status ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
prepare_index.pl -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by defining the name
                                  with a # in front of the foundry,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2013/11/04

EOHELP

  exit($status);
}
| 53 | |
# Options from the command line.
# All of these are file-level lexicals because the rest of the script
# reads them directly.
my $input;        # --input:   directory of the document to index
my $output;       # --output:  file to write to (STDOUT if omitted)
my $text;         # --human:   human friendly representation instead of JSON
my $gzip;         # --gzip:    compress the output (needs --output)
my $log_level;    # --log:     Log4perl level name
my $token_base;   # --token:   <foundry>[#<layer>] of the base tokenization
my $primary;      # --primary / --no-primary
my $pretty;       # --pretty:  pretty printed JSON
my @skip;         # --skip:    may be given multiple times
my @allow;        # --allow:   may be given multiple times

# GetOptions returns false when the command line could not be parsed
# (e.g. unknown option or missing value); treat that as a usage error
# instead of silently continuing with a partial configuration.
GetOptions(
  'input|i=s'  => \$input,
  'output|o=s' => \$output,
  'human|m'    => \$text,
  'token|t=s'  => \$token_base,
  'gzip|z'     => \$gzip,
  'skip|s=s'   => \@skip,
  'log|l=s'    => \$log_level,
  'allow|a=s'  => \@allow,
  'primary|p!' => \$primary,
  'pretty|y'   => \$pretty,
  'help|h'     => sub { printhelp() }
) or printhelp(1);

# An input directory is mandatory; test for definedness and length rather
# than truth so that a directory literally named "0" is accepted.
printhelp(1) unless defined $input && length $input;

# Compression requires a named output file.
printhelp(1) if $gzip && !$output;

# The usage text documents primary data output as enabled by default;
# only an explicit --no-primary (which sets 0) switches it off.
$primary //= 1;
| 71 | |
# Fall back to the ERROR level when --log was not given.
$log_level = 'ERROR' unless defined $log_level;

# Lookup table of everything passed via --skip, keyed by the lowercased value.
my %skip = map { ( lc($_) => 1 ) } @skip;

# Log4perl configuration: everything goes through a single appender
# named STDERR using a coloured screen appender and a pattern layout.
my %log_conf = (
  'log4perl.rootLogger'                               => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'                          => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'                   => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');

# Take both timestamps at compile time, before the rest of the script runs:
# $main::TIME marks the overall start, $main::LAST_STOP the most recent
# measurement point (advanced by stop_time).
BEGIN {
  $main::TIME      = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
}
| 90 | |
# Log, at trace level, the time elapsed since the previous call (or since
# script start for the first call) together with the overall elapsed time,
# then make the current moment the new reference point.
sub stop_time {
  my $now = Benchmark->new;

  my $since_last = timestr(timediff($now, $main::LAST_STOP));
  my $overall    = timestr(timediff($now, $main::TIME));

  $log->trace("The code took: $since_last (overall: $overall)");

  $main::LAST_STOP = $now;
}
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 100 | |
# Example call: perl script/prepare_index.pl --input WPD/AAA/00001

# Create and parse new document.
# A trailing slash is appended to the input path when it is missing.
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::Document->new( path => $input );
$doc->parse;

# Base tokenization: defaults to OpenNLP/Tokens. --token may override the
# foundry only (the layer is optional), so each part is replaced
# individually and a missing part keeps its default instead of becoming
# undefined.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
}

# Get tokenization
my $tokens = KorAP::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);
$tokens->parse;

# Annotation layers added by default, in this order.
# Each entry is [foundry, layer, optional further arguments], passed
# unchanged to $tokens->add.
my @layers = (
  # Base information
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600'],
  ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Dependency']
);

if ($skip{'#all'}) {
  # Everything is skipped: add only what was explicitly allowed,
  # given as <foundry>#<layer>.
  foreach my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  }
}
else {
  # Add to index file - respect skipping.
  # A layer is left out when either its whole foundry (--skip Mate) or the
  # specific foundry#layer pair (--skip Mate#Morpho) was requested.
  # %skip keys are lowercased, so the comparison is case insensitive.
  foreach my $info (@layers) {
    my $foundry = lc $info->[0];
    my $layer   = lc $info->[1];
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  }
}
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 169 | |
# Serialize the collected data: human friendly text with --human,
# pretty printed JSON with --pretty, compact JSON otherwise.
my $print_text;
if ($text) {
  $print_text = $tokens->to_string($primary);
}
elsif ($pretty) {
  $print_text = $tokens->to_pretty_json($primary);
}
else {
  $print_text = $tokens->to_json($primary);
}

my $file;

if ($output) {
  # Open the target, gzip compressed if requested. Both constructors
  # return undef on failure, so fail here with the real reason instead of
  # crashing later on a method call on an undefined value.
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or die "Unable to open '$output' for compressed writing: $GzipError\n";
  }
  else {
    $file = IO::File->new($output, "w")
      or die "Unable to open '$output' for writing: $!\n";
  }

  # binmode $file, ':utf8';

  # Buffered write errors may only surface on close, so check both.
  $file->print($print_text)
    or die "Unable to write to '$output': $!\n";
  $file->close
    or die "Unable to close '$output': $!\n";
}
else {
  # binmode STDOUT, ':utf8';
  print $print_text . "\n";
}

stop_time();
Nils Diewald | 2db9ad0 | 2013-10-29 19:26:43 +0000 | [diff] [blame] | 193 | |
| 194 | __END__ |