blob: 03a808893243dba53cb01ebfe1a81797e7a59eca [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
12use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010013use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010014use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010016use Parallel::ForkManager;
Akron93d620e2016-02-05 19:40:05 +010017
Akron941c1a62016-02-23 17:41:41 +010018# CHANGES:
19# ----------------------------------------------------------
20# 2013/11/25
21# - Initial release
22#
23# 2014/10/29
24# - Merges foundry data to create indexer friendly documents
25#
Akron93d620e2016-02-05 19:40:05 +010026# 2016/02/04
27# - renamed to korapxml2krill
28# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010029#
30# 2016/02/12
31# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010032# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010033#
34# 2016/02/14
35# - Added version information
Akron941c1a62016-02-23 17:41:41 +010036# - Added support for archive files
37#
38# 2016/02/15
39# - Fixed temporary directory bug
40# - Improved skipping before unzipping
41# - Added EXPERIMENTAL concurrency support
42#
43# 2016/02/23
44# - Merge korapxml2krill and korapxml2krill_dir
45# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010046
# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2016/02/23';

# Directory this script lives in (used to call the script again
# for each document when processing archives)
our $LOCAL = $FindBin::Bin;

# Version banner printed by --help, --version and on usage errors
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
52
53
# Parse command: a leading argument that does not start with
# a dash (e.g. 'archive') is taken from @ARGV as the command
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +010060
# Parse options from the command line.
#
# The option variables are declared up front and passed as plain
# references. This matters for the list options: '\(my @list)' takes
# references to the *elements* of the (empty) array instead of a
# reference to the array, so the option would have no destination.
my ($input, $output, $overwrite, $text, $token_base);
my ($gzip, $primary, $pretty);
my (@skip, @allow);
my $log_level = 'ERROR';
my $jobs      = 0;

GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'human|m'     => \$text,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'jobs|j=i'    => \$jobs,
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
)
  # Unknown options or malformed values: show the usage and stop
  # instead of continuing with a partially parsed command line
  or pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -exit     => 1
  );
89
# Arguments for pod2usage on usage errors: print the listed POD
# sections together with the version banner and exit with status 1
my %ERROR_HASH = (
  -exit     => 1,
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Input has to be defined
if (!$input) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +000099
Nils Diewald7364d1f2013-11-05 19:26:35 +0000100
# Initialize log4perl object: the root logger uses the requested
# (uppercased) level and a single appender named STDERR
my %log_conf = (
  'log4perl.rootLogger'
    => "\U$log_level\E, STDERR",
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
110
Akron941c1a62016-02-23 17:41:41 +0100111
# Get file name based on path information.
#
# Expects the path of a text. The first occurrence of the current
# $input (optionally preceded by a slash) is removed, all remaining
# slashes are turned into dashes and leading dashes are dropped.
# Returns the resulting flat name without any file extension.
sub get_file_name ($) {
  my $file = shift;

  # $input is a literal path and not a pattern: quote it, so regex
  # metacharacters in directory names (e.g. '.', '+' or '(') can
  # neither match unintended text nor break the expression.
  # (The former '^?' was a quantified anchor without any effect.)
  $file =~ s{/?\Q$input\E}{};
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};
  return $file;
};
120
Akron941c1a62016-02-23 17:41:41 +0100121
# Write file.
#
# Converts the text found in directory $anno by calling this script
# again in single document mode and forwarding the relevant options.
# The result is written to "$output/<name>.json" (with '.gz' appended
# if gzip is requested).
# Returns the flat name of the text as created by get_file_name().
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is passed to system() as a list, so no shell is involved:
  # paths containing spaces stay intact and values like '#all' are not
  # taken as the start of a shell comment
  my @call = (
    'perl', $LOCAL . '/korapxml2krill',
    '-i', $anno,
    '-o', $target
  );
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level  if $log_level;

  # Forward the primary flag only if it was given, and forward it
  # with the same polarity it was given with
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # --pretty is a flag and takes no value
  push @call, '-y' if $pretty;

  push @call, map { ('-a', $_) } @allow;
  push @call, map { ('-s', $_) } @skip;

  # Report failing conversions instead of dropping them silently
  if (system(@call) != 0) {
    $log->error("Conversion of $anno failed (status $?)");
  };

  return "$file";
};
143
Nils Diewald2db9ad02013-10-29 19:26:43 +0000144
# Process a single file
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lookup table of the (lowercased) foundries and foundry#layer
  # combinations that should be skipped
  my %skip = map { (lc($_) => 1) } @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time elapsed since the last call
  # and since the start, then remember the current time
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document (the path needs a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new( path => $input );

  # Name used in messages: $output is undefined when the
  # result is written to STDOUT, so fall back to the input
  my $target = defined $output ? $output : $input;

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  # Foundry and layer of the base tokenization
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # All known [foundry, layer] pairs in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt (disabled)
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - add the explicitly allowed layers only
  if ($skip{'#all'}) {
    foreach (@allow) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }

  # Add to index file - respect skipping
  else {
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc($_) } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  # Serialize in the requested representation
  my $print_text;
  if ($text) {
    $print_text = $tokens->to_string($primary);
  }
  elsif ($pretty) {
    $print_text = $tokens->to_pretty_json($primary);
  }
  else {
    $print_text = $tokens->to_json($primary);
  };

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1);

      # Fail with a message instead of a method call on undef
      unless ($file) {
        $log->error("Unable to write '$output': $GzipError");
        exit(1);
      };
    }
    else {
      # IO::File is not loaded explicitly at the top of the script
      require IO::File;
      $file = IO::File->new($output, "w");

      unless ($file) {
        $log->error("Unable to write '$output': $!");
        exit(1);
      };
    };

    $file->print($print_text);
    $file->close;
  }

  # No output file given - print to STDOUT
  else {
    print $print_text . "\n";
  };

  stop_time();
}

# Process an archive
elsif ($cmd eq 'archive') {

  pod2usage(%ERROR_HASH) unless $output;

  # The output has to be an existing directory
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The first two arguments are the process id and the exit code,
      # the last one is the reference passed to finish() (if any)
      my ($pid, $code) = @_;
      my $data = $_[-1];

      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data eq 'SCALAR' ? $$data : '') . "\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Do not convert texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork (true in the parent of a forked child)
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    # File::Temp is not loaded explicitly at the top of the script
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc_id, $text_id) = $archive->split_path($text_path);

      # Do not convert texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc_id, $text_id)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork (true in the parent of a forked child)
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory - $input is reset on purpose,
        # as get_file_name() strips it from the path of the text
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc_id, $text_id);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing to process: stop here instead of reporting success
  # and a run time that was never started
  else {
    print "Input is neither a directory nor an archive.\n\n";
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command: name the offending command
# and exit with the usage information
else {
  warn sprintf("Unknown command '%s'.\n\n", $cmd);
  pod2usage(%ERROR_HASH);
}

453__END__
Akron941c1a62016-02-23 17:41:41 +0100454
455=pod
456
457=encoding utf8
458
459=head1 NAME
460
461korapxml2krill - Merge KorapXML data and create Krill friendly documents
462
463
464=head1 SYNOPSIS
465
466 $ korapxml2krill [archive] -z --input <directory> --output <filename>
467
468
469=head1 DESCRIPTION
470
471L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
472compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
473
474
475=head1 INSTALLATION
476
477The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
478
479 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
480
481In case everything went well, the C<korapxml2krill> command line tool will
482be available.
483
484
485=head1 ARGUMENTS
486
487=over 2
488
489=item B<archive>
490
491Process an archive as a Zip-File or a folder of KorAP-XML documents.
492
493=back
494
495
496=head1 OPTIONS
497
498=over 2
499
500=item B<--input|-i> <directory|file>
501
502Directory or archive file of documents to index.
503
504=item B<--output|-o> <directory|file>
505
506Output folder for archive processing or
507document name for single output (optional),
508writes to <STDOUT> by default.
509
510=item B<--overwrite|-w>
511
512Overwrite files that already exist.
513
514=item B<--token|-t> <foundry>[#<file>]
515
516Define the default tokenization by specifying
517the name of the foundry and optionally the name
518of the layer-file. Defaults to OpenNLP#tokens.
519
520=item B<--skip|-s> <foundry>[#<layer>]
521
Skip specific foundries by specifying the foundry name,
or specific layers by appending the layer name to the
foundry name, separated by a #, e.g. Mate#Morpho.
Alternatively you can skip #ALL.
Can be set multiple times.
527
528=item B<--allow|-a> <foundry>#<layer>
529
530Allow specific foundries and layers by defining them
531combining the foundry name with a # and the layer name.
532
533=item B<--primary|-p>
534
535Output primary data or not. Defaults to true.
536Can be flagged using --no-primary as well.
537
538=item B<--jobs|-j>
539
Define the number of concurrent jobs in separate forks
541for archive processing, defaults to 0. This is B<EXPERIMENTAL>!
542
543=item B<--human|-m>
544
545Represent the data human friendly, while the output defaults to JSON.
546
547=item B<--pretty|-y>
548
549Pretty print JSON output.
550
551=item B<--gzip|-z>
552
553Compress the output (expects a defined output file in single processing).
554
555=item B<--log|-l>
556
557The L<Log4perl> log level, defaults to C<ERROR>.
558
559=item B<--help|-h>
560
561Print this document.
562
563=item B<--version|-v>
564
565Print version information.
566
567=back
568
569=head1 AVAILABILITY
570
571 https://github.com/KorAP/KorAP-XML-Krill
572
573
574=head1 COPYRIGHT AND LICENSE
575
576Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
577Author: L<Nils Diewald|http://nils-diewald.de/>
578
579L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
580Corpus Analysis Platform at the
581L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
582member of the
583L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
584
585This program is free software published under the
586L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
587
588=cut