blob: 486032a903bdb3df48fd5c890b03ca64a3a973bb [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;
use KorAP::Document;
use KorAP::Tokenizer;
11
our $VERSION = 0.03;

# Merges foundry data to create indexer friendly documents
# ndiewald, 2014/10/29
Nils Diewald7364d1f2013-11-05 19:26:35 +000016
# Print the usage message to STDOUT and terminate the program.
#
# Parameters:
#   $_[0] - exit status (optional); 0 is used when it is not defined.
#
# This sub never returns to its caller.
sub printhelp {
  my $status = $_[0] // 0;

  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
prepare_index.pl -z --input <directory> --output <filename>

--input|-i <directory>          Directory of the document to index
--output|-o <filename>          Document name for output (optional),
                                Writes to <STDOUT> by default
--overwrite|-w                  Overwrite files that already exist
--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                the name of the foundry and optionally the name
                                of the layer. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                or specific layers by defining the name
                                with a # in front of the foundry,
                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                Can be set multiple times.
--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                combining the foundry name with a # and the layer name.
                                Only evaluated in combination with --skip #ALL.
--primary|-p                    Output primary data or not. Defaults to true.
                                Can be flagged using --no-primary as well.
--human|-m                      Represent the data human friendly,
                                while the output defaults to JSON
--pretty|-y                     Pretty print json output
--gzip|-z                       Compress the output
                                (expects a defined output file)
--log|-l                        The Log4perl log level, defaults to ERROR.
--help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2014/11/05

EOHELP
  exit($status);
};
54
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, $token_base,
    $primary, $pretty, $overwrite);
my (@skip, @allow);

# GetOptions returns false when it meets an unknown or malformed option;
# show the usage and stop instead of continuing with a partial setup.
# NOTE(review): the help text says --primary defaults to true, but $primary
# stays undef unless the flag is given - confirm how the to_* methods of
# KorAP::Tokenizer treat an undefined argument.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp() }
) or printhelp(1);

# An input directory is mandatory; gzip output needs a file to write to
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# Lookup table of everything to skip, keyed case-insensitively
my %skip = map { lc($_) => 1 } @skip;
79
# Configure Log4perl: a single colored screen appender named STDERR,
# with the root logger running at the requested level.
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

# File-scoped logger, also used by the subs defined further down
my $log = Log::Log4perl->get_logger('main');

# Ignore processing:
# without --overwrite an already existing output file is left untouched
# and the script ends successfully.
if (!$overwrite && $output && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
};
94
# Start the timers at compile time, i.e. before any of the run-time
# statements of this script are executed.
#   $main::TIME      - reference point for the overall runtime
#   $main::LAST_STOP - reference point of the latest stop_time call
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};
99
# Log (at TRACE level) the time elapsed since the previous call and since
# the start of the script, then make the current time the new reference
# point in $main::LAST_STOP.
# Takes no parameters; uses the file-scoped logger $log.
sub stop_time {
  my $new = Benchmark->new;
  $log->trace(
    'The code took: '.
      timestr(timediff($new, $main::LAST_STOP)) .
      ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
  );
  $main::LAST_STOP = $new;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000109
# Call perl script/prepare_index.pl WPD/AAA/00001

# Create and parse new document
# (a slash is appended when the input path does not end with one)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::Document->new( path => $input );

unless ($doc->parse) {
  # --output is optional (STDOUT mode), so name the input path
  # when no output file was given.
  $log->warn(($output // $input) . " can't be processed - no document data");
  exit(0);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000120
# Base tokenization: defaults, optionally overridden by --token
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry;

  # The layer part is optional - keep the default when it is missing
  $token_base_layer = $layer if defined $layer && length $layer;
};

# Get tokenization
my $tokens = KorAP::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  # --output is optional (STDOUT mode), so name the input path
  # when no output file was given.
  $log->error(($output // $input) . " can't be processed - no base tokenization");
  exit(0);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000140
# All known annotation layers as [foundry, layer] pairs,
# in the order in which they are added to the tokenization.
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Schreibgebrauch', 'Lemma'],
  ['Schreibgebrauch', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],
);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000188
189
if ($skip{'#all'}) {
  # Everything is skipped - add only the explicitly allowed
  # <foundry>#<layer> combinations
  foreach my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  };
}
else {
  # Add to index file - respect skipping
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # A layer is skipped when its whole foundry (--skip Mate)
    # or the specific combination (--skip Mate#Morpho) was given
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000205
my $file;

# Serialize the result: human friendly text, pretty JSON or plain JSON
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {

  # Open the target, optionally compressing on the fly.
  # Both constructors return undef on failure.
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1);
    unless ($file) {
      $log->error("Unable to open $output for gzip output: $GzipError");
      exit(1);
    };
  }
  else {
    $file = IO::File->new($output, "w");
    unless ($file) {
      $log->error("Unable to open $output for writing: $!");
      exit(1);
    };
  };

  # Write errors may only surface when the handle is closed,
  # so both calls are checked.
  unless ($file->print($print_text) && $file->close) {
    $log->error("Unable to write to $output: " . ($gzip ? $GzipError : $!));
    exit(1);
  };
}

else {
  print $print_text . "\n";
};

stop_time();

__END__