#!/usr/bin/env perl
use strict;
use warnings;
use lib 'lib', '../lib';
use Getopt::Long;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;

use KorAP::XML::Krill;
use KorAP::XML::Tokenizer;

our $VERSION = 0.04;

# Merges foundry data to create indexer friendly documents
# ndiewald, 2014/10/29

# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support

# Print the usage text to STDOUT and terminate the program.
# The first argument, if defined, is used as the exit status (defaults to 0).
sub printhelp {
  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

--input|-i <directory>          Directory of the document to index
--output|-o <filename>          Document name for output (optional),
                                Writes to <STDOUT> by default
--overwrite|-w                  Overwrite files that already exist
--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                the name of the foundry and optionally the name
                                of the layer. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                or specific layers by defining the name
                                with a # in front of the foundry,
                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                Can be set multiple times.
--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                combining the foundry name with a # and the layer name.
--primary|-p                    Output primary data or not. Defaults to true.
                                Can be flagged using --no-primary as well.
--human|-m                      Represent the data human friendly,
                                while the output defaults to JSON
--pretty|-y                     Pretty print json output
--gzip|-z                       Compress the output
                                (expects a defined output file)
--log|-l                        The Log4perl log level, defaults to ERROR.
--help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2016/02/04

EOHELP
  exit($_[0] // 0);
}

# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

# GetOptions returns false on unknown or malformed options; show the usage
# and fail instead of continuing with a partially parsed command line.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp() }
) or printhelp(1);

# An input directory is mandatory and gzip output requires a target file
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# Lower-cased lookup table of everything passed via --skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Configure Log4perl to write colored messages of the requested level to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Ignore processing:
# an existing output file is left untouched unless --overwrite is given
if ($output && !$overwrite && -e $output) {
  $log->trace($output . ' already exists');
  exit(0);
}

# Take the reference timestamps at compile time, so the measurement
# starts before any of the runtime statements of the script are executed
BEGIN {
  $main::TIME      = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
}

# Log (on trace level) the time elapsed since the previous call and since
# program start, then make the current timestamp the new checkpoint.
sub stop_time {
  my $now        = Benchmark->new;
  my $since_last = timestr(timediff($now, $main::LAST_STOP));
  my $overall    = timestr(timediff($now, $main::TIME));

  $log->trace(
    'The code took: ' . $since_last . ' (overall: ' . $overall . ')'
  );

  $main::LAST_STOP = $now;
}

# Call perl script/korapxml2krill WPD/AAA/00001

# Create and parse new document
# (a slash is appended to the input path if it does not end with one)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

unless ($doc->parse) {
  # $output is undefined when writing to STDOUT - name the input instead,
  # so the message stays meaningful and free of "uninitialized" warnings
  $log->warn(($output // $input) . " can't be processed - no document data");
  exit(0);
}

# Base tokenization, given as <foundry>[#<layer>] via --token.
# The layer part is optional, so a missing (or empty) part must keep
# its default instead of being overwritten with an undefined value.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
}

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path    => $doc->path,
  doc     => $doc,
  foundry => $token_base_foundry,
  layer   => $token_base_layer,
  name    => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  # $output is undefined when writing to STDOUT - name the input instead
  $log->error(($output // $input) . " can't be processed - no base tokenization");
  exit(0);
}

# All [foundry, layer] pairs that are merged by default,
# in the order they are added to the token stream
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt (currently disabled)
  # ['Malt', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],
);

# Merge the annotation layers into the token stream
if ($skip{'#all'}) {

  # Everything is skipped - add only what was explicitly allowed
  foreach my $allowed (@allow) {
    $tokens->add(split(/#/, $allowed));
    stop_time();
  }
}
else {

  # Add to index file - respect skipping.
  # A --skip value may name a whole foundry ("mate") or a single
  # layer of a foundry ("mate#morpho"); keys in %skip are lower-cased.
  foreach my $info (@layers) {
    my ($foundry, $layer) = map { lc } @$info;
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  }
}

# Serialize the merged token stream: plain text for --human,
# otherwise JSON (pretty printed on --pretty).
# NOTE(review): $primary stays undefined unless --primary/--no-primary is
# passed, while the help text says it defaults to true - confirm that the
# serializers treat an undefined flag as true.
my $print_text =
    $text   ? $tokens->to_string($primary)
  : $pretty ? $tokens->to_pretty_json($primary)
  :           $tokens->to_json($primary);

if ($output) {

  # Pick the writer - both constructors return undef on failure
  my $file = $gzip
    ? IO::Compress::Gzip->new($output, Minimal => 1)
    : IO::File->new($output, 'w');

  unless ($file) {
    $log->error(
      'Unable to open ' . $output . ' for writing - ' .
      ($gzip ? $GzipError : $!)
    );
    exit(1);
  }

  # Buffered write errors may only surface on close, so check both calls
  my $written = $file->print($print_text);
  my $closed  = $file->close;

  unless ($written && $closed) {
    $log->error(
      'Unable to write ' . $output . ' - ' .
      ($gzip ? $GzipError : $!)
    );
    exit(1);
  }
}
else {
  print $print_text . "\n";
}

stop_time();

__END__