blob: 9e2d1e8636fb3b28bba324d407a91ed5d5bf75bb [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Nils Diewald2db9ad02013-10-29 19:26:43 +00004use lib 'lib', '../lib';
Nils Diewald7364d1f2013-11-05 19:26:35 +00005use Getopt::Long;
6use Benchmark qw/:hireswallclock/;
7use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use Log::Log4perl;
Nils Diewald2db9ad02013-10-29 19:26:43 +00009
Akron93d620e2016-02-05 19:40:05 +010010use KorAP::XML::Krill;
11use KorAP::XML::Tokenizer;
12
Nils Diewald7364d1f2013-11-05 19:26:35 +000013# Merges foundry data to create indexer friendly documents
Nils Diewald32e30f02014-10-30 00:52:36 +000014# ndiewald, 2014/10/29
Nils Diewald7364d1f2013-11-05 19:26:35 +000015
Akron93d620e2016-02-05 19:40:05 +010016# 2016/02/04
17# - renamed to korapxml2krill
18# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010019#
20# 2016/02/12
21# - fixed foundry skipping
Akron150b29e2016-02-14 23:06:48 +010022#
23# 2016/02/14
24# - Added version information
Akron069bd712016-02-12 19:09:06 +010025
# Print the version of KorAP::XML::Krill and terminate the program.
#
# Called for the --version|-v command line flag. Asking for the version
# is a successful, informational request, so the process ends with
# status 0 - the same status an explicit --help request yields.
sub printversion {
  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
  exit(0);
};
Akron93d620e2016-02-05 19:40:05 +010030
# Print the usage message and terminate the program.
#
# Parameters:
#   $_[0] - optional exit status, defaults to 0
#
# With status 0 (help was requested explicitly) the text is written to
# STDOUT. Any other status marks a usage error; the text is then written
# to STDERR, because STDOUT is where the converted document goes when no
# output file is given and the message must not get mixed into it.
sub printhelp {
  my $status = $_[0] // 0;
  my $out = $status ? \*STDERR : \*STDOUT;

  print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by defining the name
                                  with a # in front of the foundry,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)
  --version|-v                    Print version information

diewald@ids-mannheim.de, 2016/02/14

EOHELP
  exit($status);
};
69
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

# --skip and --allow may be given several times and are collected in
# arrays; --primary is negatable (--no-primary). The --help and --version
# handlers terminate the program from within the option parser.
# GetOptions returns false when it meets an unknown or malformed option;
# in that case show the usage and stop with an error status instead of
# carrying on with a partly parsed command line.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp },
  'version|v'   => sub { printversion }
) or printhelp(1);
88
# An input directory is mandatory, and compressed output is only
# possible when an output file is named - otherwise show the usage
# and stop with an error status
unless ($input && (!$gzip || $output)) {
  printhelp(1);
};

# Fall back to the default log level
$log_level = 'ERROR' unless defined $log_level;

# Lookup table of everything passed to --skip, keyed in lower case
my %skip = map { (lc($_) => 1) } @skip;
95
# Log4perl configuration: a single coloured screen appender named STDERR
# attached to the root logger, filtered by the requested log level
my %log_config = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

# Logger used by the rest of the script
my $log = Log::Log4perl->get_logger('main');
104
# Ignore processing: unless --overwrite is set, an output file that
# already exists is left untouched and the script ends successfully
unless ($overwrite) {
  if ($output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };
};
110
# Take both reference timestamps at compile time:
# $main::TIME is the start of the whole run, $main::LAST_STOP the end of
# the most recently measured step (moved forward by stop_time)
BEGIN {
  ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
};
115
# Log (at trace level) the time spent since the previous call and since
# program start, then make "now" the reference point for the next call.
# Takes no arguments.
sub stop_time {
  my $now = Benchmark->new;

  my $step    = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->trace('The code took: ' . $step . ' (overall: ' . $overall . ')');

  $main::LAST_STOP = $now;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000125
# Create and parse new document
# Append a slash to the input path unless it already ends with one
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

unless ($doc->parse) {
  # $output is undefined when the result is written to STDOUT -
  # name the input directory in that case instead of an empty string
  $log->warn(($output // $input) . " can't be processed - no document data");
  exit(0);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000134
# Base tokenization, given as <foundry>[#<layer>] via --token.
# The layer part is optional: only the parts that are actually present
# replace the defaults, so "--token Foundry" keeps the default layer
# instead of passing an undefined layer on to the tokenizer.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000139
# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  # $output is undefined when the result is written to STDOUT -
  # name the input directory in that case instead of an empty string
  $log->error(($output // $input) . " can't be processed - no base tokenization");
  exit(0);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000154
# All annotation layers that can be merged, as [Foundry, Layer] pairs.
# Each table row names a foundry followed by its layers; the rows are
# expanded in the given order.
my @layers = map {
  my ($foundry, @names) = @$_;
  map { [ $foundry, $_ ] } @names;
} (
  [ Base       => qw/Sentences Paragraphs/ ],
  [ Connexor   => qw/Morpho Syntax Phrase Sentences/ ],
  [ CoreNLP    => qw/NamedEntities Sentences Morpho Constituency/ ],
  [ DeReKo     => qw/Structure/ ],
  [ Glemm      => qw/Morpho/ ],
  # Malt is currently disabled
  # [ Malt     => qw/Dependency/ ],
  [ Mate       => qw/Morpho Dependency/ ],
  [ OpenNLP    => qw/Morpho Sentences/ ],
  # Schreibgebrauch
  [ Sgbr       => qw/Lemma Morpho/ ],
  [ TreeTagger => qw/Morpho Sentences/ ],
  [ XIP        => qw/Morpho Constituency Sentences Dependency/ ],
);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000201
202
# Merge the annotation layers into the tokenization
if ($skip{'#all'}) {
  # Everything is skipped - add only what was passed to --allow,
  # each entry split at the # into the arguments for add
  for my $allowed (@allow) {
    my @spec = split('#', $allowed);
    $tokens->add(@spec);
    stop_time;
  };
}
else {
  # Add to index file - respect skipping
  for my $info (@layers) {
    my ($foundry, $layer) = map { lc($_) } @$info;

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time;
  };
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000219
# Serialize the merged document: the string representation for --human,
# otherwise JSON (pretty printed for --pretty). $primary is handed on
# to the serializer unchanged.
my $print_text = $text ? $tokens->to_string($primary) :
  ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

if ($output) {
  my $file;

  # Open the target - gzip compressed if requested. A target that cannot
  # be opened is reported with the reason instead of failing later with
  # a method call on an undefined handle.
  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1)
      or die "Unable to open $output for compressed writing: $GzipError\n";
  }
  else {
    open($file, '>', $output)
      or die "Unable to open $output for writing: $!\n";
  };

  # Buffered write errors may only surface on close, so check both
  print {$file} $print_text
    or die "Unable to write to $output: $!\n";
  close($file)
    or die "Unable to close $output: $!\n";
}

# No output file - write to STDOUT
else {
  print $print_text . "\n";
};

stop_time;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000243
244__END__