blob: 6d8611d943fe3a4a9095bb5f2745826cefe3368a [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Nils Diewald2db9ad02013-10-29 19:26:43 +00004use lib 'lib', '../lib';
Nils Diewald7364d1f2013-11-05 19:26:35 +00005use Getopt::Long;
6use Benchmark qw/:hireswallclock/;
7use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use Log::Log4perl;
Nils Diewald2db9ad02013-10-29 19:26:43 +00009use KorAP::Document;
10use KorAP::Tokenizer;
11
Nils Diewald32e30f02014-10-30 00:52:36 +000012our $VERSION = 0.02;
Nils Diewald7364d1f2013-11-05 19:26:35 +000013
14# Merges foundry data to create indexer friendly documents
Nils Diewald32e30f02014-10-30 00:52:36 +000015# ndiewald, 2014/10/29
Nils Diewald7364d1f2013-11-05 19:26:35 +000016
# Print the usage text and terminate the program.
#
# Parameters:
#   $_[0] - optional exit code (defaults to 0).
#
# With exit code 0 (explicit --help) the text goes to STDOUT. With a
# non-zero exit code (usage error) it goes to STDERR, so that a usage
# error can not end up in redirected or piped document output - the
# script writes its result to STDOUT by default.
# Never returns.
sub printhelp {
  my $exit_code = $_[0] // 0;
  my $out = $exit_code ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
prepare_index.pl -z --input <directory> --output <filename>

--input|-i <directory>          Directory of the document to index
--output|-o <filename>          Document name for output (optional),
                                Writes to <STDOUT> by default
--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                the name of the foundry and optionally the name
                                of the layer. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                or specific layers by defining the name
                                with a # after the foundry,
                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                Can be set multiple times.
--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                combining the foundry name with a # and the layer name.
--primary|-p                    Output primary data or not. Defaults to true.
                                Can be flagged using --no-primary as well.
--human|-m                      Represent the data human friendly,
                                while the output defaults to JSON
--pretty|-y                     Pretty print json output
--gzip|-z                       Compress the output
                                (expects a defined output file)
--log|-l                        The Log4perl log level, defaults to ERROR.
--help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2013/11/04

EOHELP
  exit($exit_code);
};
53
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, $token_base, $primary, $pretty);
my (@skip, @allow);

# GetOptions returns false when it meets unknown or malformed options;
# show the usage and stop instead of silently continuing.
GetOptions(
  'input|i=s'  => \$input,
  'output|o=s' => \$output,
  'human|m'    => \$text,
  'token|t=s'  => \$token_base,
  'gzip|z'     => \$gzip,
  'skip|s=s'   => \@skip,
  'log|l=s'    => \$log_level,
  'allow|a=s'  => \@allow,
  'primary|p!' => \$primary,
  'pretty|y'   => \$pretty,
  'help|h'     => sub { printhelp() }
) or printhelp(1);

# An input directory is mandatory, and gzip needs a file to write to
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# The usage text documents --primary as defaulting to true; make that
# explicit unless the user passed --primary or --no-primary.
$primary //= 1;

# Lookup table of skipped foundries/layers, keyed case-insensitively
my %skip = map { (lc($_), 1) } @skip;

# Log to STDERR with colored levels at the requested level
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');
85
# Record two independent Benchmark timestamps at compile time:
# $main::TIME marks the overall start, $main::LAST_STOP the most
# recent checkpoint (advanced by stop_time).
BEGIN {
  ($main::TIME, $main::LAST_STOP) = map { Benchmark->new } 1 .. 2;
};
90
# Log (at trace level) the time elapsed since the previous checkpoint
# and since program start, then make "now" the new checkpoint.
# Returns the Benchmark object stored as the new checkpoint.
sub stop_time {
  my $now = Benchmark->new;

  my $since_last = timestr(timediff($now, $main::LAST_STOP));
  my $overall    = timestr(timediff($now, $main::TIME));

  $log->trace('The code took: ' . $since_last . ' (overall: ' . $overall . ')');

  $main::LAST_STOP = $now;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000100
# Call perl script/prepare_index.pl --input WPD/AAA/00001

# Create and parse new document
# (make sure the document path ends with a slash)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::Document->new( path => $input );
$doc->parse;

# Base tokenization: taken from --token <foundry>[#<layer>].
# The layer part is optional, so only override the defaults with the
# parts that were actually given - otherwise "--token OpenNLP" would
# replace the default layer with undef.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);
$tokens->parse;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000122
# All [foundry, layer] pairs that may be merged into the document,
# in the order in which they are added.
my @layers = (
  # Base
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # Mate
  ['Mate', 'Morpho'],
  # ['Mate', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  # ['XIP', 'Dependency'],
);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000159
160
if ($skip{'#all'}) {
  # Everything is skipped - add only the explicitly allowed
  # <foundry>#<layer> combinations
  foreach my $allowed (@allow) {
    $tokens->add(split('#', $allowed));
    stop_time();
  };
}
else {
  # Add to index file - respect skipping.
  # A layer is left out if either its whole foundry was skipped
  # (--skip <foundry>) or the specific layer was
  # (--skip <foundry>#<layer>); the lookup is case-insensitive.
  foreach my $info (@layers) {
    my ($foundry, $layer) = map { lc } @$info;
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000176
# Serialize the merged document: human readable text, pretty printed
# JSON or (by default) compact JSON
my $print_text;
if ($text) {
  $print_text = $tokens->to_string($primary);
}
elsif ($pretty) {
  $print_text = $tokens->to_pretty_json($primary);
}
else {
  $print_text = $tokens->to_json($primary);
};

if ($output) {
  my $file;

  if ($gzip) {
    # The constructor returns undef on failure and reports the
    # reason in $GzipError
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or die "Unable to open '$output' for compressed writing: $GzipError\n";
  }
  else {
    open($file, '>', $output)
      or die "Unable to open '$output' for writing: $!\n";
  };

  # Buffered write errors may only surface on close, so check both
  print {$file} $print_text
    or die "Unable to write to '$output': $!\n";
  close($file)
    or die "Unable to close '$output': $!\n";
}
else {
  print $print_text . "\n";
};

stop_time();
Nils Diewald2db9ad02013-10-29 19:26:43 +0000197
198__END__