blob: 5ec08055ab71dae7040abefb1bb5ced3499417f9 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Nils Diewald2db9ad02013-10-29 19:26:43 +00004use lib 'lib', '../lib';
Nils Diewald7364d1f2013-11-05 19:26:35 +00005use Getopt::Long;
6use Benchmark qw/:hireswallclock/;
7use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +00008use Log::Log4perl;
Nils Diewald2db9ad02013-10-29 19:26:43 +00009
Akron9a04c712016-02-05 19:40:05 +010010use KorAP::XML::Krill;
11use KorAP::XML::Tokenizer;
12
13our $VERSION = 0.04;
Nils Diewald7364d1f2013-11-05 19:26:35 +000014
15# Merges foundry data to create indexer friendly documents
Nils Diewald79a355c2014-10-30 00:52:36 +000016# ndiewald, 2014/10/29
Nils Diewald7364d1f2013-11-05 19:26:35 +000017
Akron9a04c712016-02-05 19:40:05 +010018# 2016/02/04
19# - renamed to korapxml2krill
20# - added Schreibgebrauch support
Akron9078bb92016-02-12 19:09:06 +010021#
22# 2016/02/12
23# - fixed foundry skipping
24
Akron9a04c712016-02-05 19:40:05 +010025
# Print the usage text and terminate the script.
#
# Parameters:
#   $_[0] - exit status to terminate with (optional, defaults to 0).
#
# The text is written to STDOUT for a regular help request (status 0)
# and to STDERR when the script is aborted with a non-zero status, so
# the usage text never ends up in redirected document output.
#
# Never returns - always calls exit().
sub printhelp {
    my $status = defined $_[0] ? $_[0] : 0;
    my $out    = $status ? \*STDERR : \*STDOUT;

    print {$out} <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents.

Call:
korapxml2krill -z --input <directory> --output <filename>

  --input|-i <directory>          Directory of the document to index
  --output|-o <filename>          Document name for output (optional),
                                  Writes to <STDOUT> by default
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#Tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by appending the layer name
                                  to the foundry name, separated by a #,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
                                  Only evaluated when #ALL is skipped.
                                  Can be set multiple times.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2016/02/12

EOHELP

    exit($status);
};
63
# Options from the command line
my ($input, $output, $text, $gzip, $log_level, @skip, $token_base,
    $primary, @allow, $pretty, $overwrite);

# GetOptions returns false when the command line cannot be parsed
# (e.g. unknown option, missing option value). Abort with the usage
# text in that case instead of continuing with a partial configuration.
GetOptions(
    'input|i=s'   => \$input,
    'output|o=s'  => \$output,
    'overwrite|w' => \$overwrite,
    'human|m'     => \$text,
    'token|t=s'   => \$token_base,
    'gzip|z'      => \$gzip,
    'skip|s=s'    => \@skip,
    'log|l=s'     => \$log_level,
    'allow|a=s'   => \@allow,
    'primary|p!'  => \$primary,
    'pretty|y'    => \$pretty,
    'help|h'      => sub { printhelp() }
) or printhelp(1);

# An input directory is mandatory; gzip output requires an output file
printhelp(1) if !$input || ($gzip && !$output);

$log_level //= 'ERROR';

# Lookup table of lowercased skip specifications
# ("foundry", "foundry#layer" or "#all")
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR with the requested level
Log::Log4perl->init({
    'log4perl.rootLogger' => uc($log_level) . ', STDERR',
    'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout' => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Ignore processing - the output file exists and must not be overwritten
if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
};
103
# Take the reference timestamps at compile time: one for the whole run
# and one for the most recent measurement point.
BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
};

# Log (at trace level) the time spent since the previous call and
# since the BEGIN block ran, then make "now" the new measurement point.
sub stop_time {
    my $now          = Benchmark->new;
    my $since_last   = timestr(timediff($now, $main::LAST_STOP));
    my $since_begin  = timestr(timediff($now, $main::TIME));

    $log->trace("The code took: $since_last (overall: $since_begin)");

    $main::LAST_STOP = $now;
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000118
# Create and parse new document
# (make sure the input directory ends with a slash)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new( path => $input );

# Name of the document in log messages: the output file if one was
# given, otherwise the input directory. --output is optional, so
# $output alone may be undefined here.
my $doc_label = $output // $input;

# NOTE(review): the script terminates with status 0 although nothing
# is written - presumably so batch conversions continue; confirm.
unless ($doc->parse) {
    $log->warn($doc_label . " can't be processed - no document data");
    exit(0);
};

# Base tokenization, given as <foundry>[#<layer>]
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;

    # The layer part is optional - fall back to the default layer
    # instead of passing an undefined layer to the tokenizer
    $token_base_layer //= 'Tokens';
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
    $log->error($doc_label . " can't be processed - no base tokenization");
    exit(0);
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000147
# Foundry/layer pairs that can be merged into the document. Each row
# names a foundry followed by its layers; the rows are expanded into
# [foundry, layer] pairs, keeping the order given here.
my @layers = map {
    my ($foundry, @layer_names) = @$_;
    map { [ $foundry, $_ ] } @layer_names;
} (
    [ Base       => qw/Sentences Paragraphs/ ],
    [ Connexor   => qw/Morpho Syntax Phrase Sentences/ ],
    [ CoreNLP    => qw/NamedEntities Sentences Morpho Constituency/ ],
    [ DeReKo     => qw/Structure/ ],
    [ Glemm      => qw/Morpho/ ],
    # Malt is currently disabled
    # [ Malt     => qw/Dependency/ ],
    [ Mate       => qw/Morpho Dependency/ ],
    [ OpenNLP    => qw/Morpho Sentences/ ],
    # Schreibgebrauch
    [ Sgbr       => qw/Lemma Morpho/ ],
    [ TreeTagger => qw/Morpho Sentences/ ],
    [ XIP        => qw/Morpho Constituency Sentences Dependency/ ],
);
Nils Diewald2db9ad02013-10-29 19:26:43 +0000194
195
# Merge the annotation layers into the token stream
if ($skip{'#all'}) {
    # Everything is skipped - add only the explicitly allowed
    # <foundry>#<layer> pairs
    for my $allowed (@allow) {
        $tokens->add(split('#', $allowed));
        stop_time();
    };
}
else {
    # Add to index file - respect skipping
    for my $layer (@layers) {
        my ($foundry, $name) = map { lc($_) } @$layer;

        # Skip if Foundry or Foundry#Layer should be skipped
        next if $skip{$foundry};
        next if $skip{$foundry . '#' . $name};

        $tokens->add(@$layer);
        stop_time();
    };
};
Nils Diewald2db9ad02013-10-29 19:26:43 +0000212
my $file;

# Serialize the merged document: human friendly text if requested,
# otherwise (optionally pretty printed) JSON
my $print_text = $text   ? $tokens->to_string($primary)
               : $pretty ? $tokens->to_pretty_json($primary)
               :           $tokens->to_json($primary);

if ($output) {

    # Open the target file. Both constructors return undef on failure,
    # so report the reason instead of failing on the print call below.
    if ($gzip) {
        $file = IO::Compress::Gzip->new($output, Minimal => 1)
            or $log->logdie("Unable to open $output for gzip output: $GzipError");
    }
    else {
        # Load IO::File explicitly rather than relying on another
        # module having loaded it
        require IO::File;
        $file = IO::File->new($output, "w")
            or $log->logdie("Unable to open $output for writing: $!");
    };

    $file->print($print_text);

    # Buffered write errors surface on close
    $file->close
        or $log->logdie("Unable to finish writing $output: $!");
}

# No output file given - write to STDOUT
else {
    print $print_text . "\n";
};

stop_time;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000236
237__END__