#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp ();
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File ();
use Log::Log4perl;
use Pod::Usage;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use Parallel::ForkManager;
# TODO: use Parallel::Loops

# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
# ----------------------------------------------------------

# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2016/03/02';

# Directory this script lives in
our $LOCAL = $FindBin::Bin;

# Banner printed by --help and --version:
# library version, contact address and date of the last change
our $VERSION_MSG =
  'Version ' . $KorAP::XML::Krill::VERSION .
  ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";


# Parse command: a first argument that does not start with a dash
# is taken as the command and removed from the argument list
my $cmd;
our @ARGV;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Options that may be given multiple times
my (@skip, @sigle);

# Parse options from the command line.
# GetOptions returns false on unknown or malformed options - show the
# usage and stop instead of continuing with a partial configuration.
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'human|m'     => \(my $text),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'allow|a=s'   => \(my @allow),
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -exit => 1
);

# Arguments for pod2usage whenever the call is erroneous:
# print version banner and usage sections, then exit with status 1
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -exit     => 1
);

# Input has to be defined
if (!$input) {
  pod2usage(%ERROR_HASH);
};


# Initialize log4perl object:
# the root logger uses the requested level and an appender named STDERR
my %log_config = (
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.rootLogger' => uc($log_level) . ', STDERR'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');


# Get file name based on path information.
#
# Removes the first occurrence of the input path (optionally preceded
# by a slash) from the given path, turns all remaining slashes into
# dashes and strips leading dashes.
#
# Expects the path as the only argument and
# returns the file name without any extension.
sub get_file_name ($) {
  my $file = shift;

  # Quote the input path, so regex metacharacters in it
  # (like '.' or '+') are matched literally
  $file =~ s{/?\Q$input\E}{};
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};
  return $file;
};


# Write file.
#
# Converts a single text directory by running the korapxml2krill
# script next to this one in a separate process, forwarding all
# relevant options.
#
# Expects the path to the text directory. The result is written
# to the output directory, named after get_file_name() with the
# extension '.json' (or '.json.gz' when compressing).
# Returns the file name without extension.
sub write_file {
  my $anno = shift;
  my $file = get_file_name $anno;

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # Pass the call as a list, so no shell is involved: paths with
  # spaces survive and values like '#all' are not read as comments
  my @call = (
    'perl', $LOCAL . '/korapxml2krill',
    '-i', $anno,
    '-o', $target
  );
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # NOTE(review): --no-primary is forwarded if $primary is true,
  # which looks inverted - confirm the intended semantics
  push @call, '--no-primary' if $primary;

  # --pretty is a flag and takes no value
  push @call, '-y' if $pretty;
  push @call, map { ('-a', $_) } @allow;
  push @call, map { ('-s', $_) } @skip;

  # Report failed conversions instead of ignoring them
  if (system(@call) != 0) {
    $log->error(
      "Conversion of $anno failed: " .
        ($? == -1 ? $! : 'exit status ' . ($? >> 8))
      );
  };

  return "$file";
};


# Convert sigle to path construct - a sigle of the form
# A_B.C (surrounding whitespace ignored) becomes A/B/C.
# The elements of @sigle are modified in place.
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Process a single file
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lowercased lookup of everything passed with --skip
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Remember the start time as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last call of stop_time
  # and since the start of the program
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Label for log messages - without an output file the result
  # is printed to STDOUT and $output is undefined
  my $label = defined $output ? $output : $input;

  # Create and parse new document (the path requires a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new( path => $input );

  unless ($doc->parse) {
    $log->warn($label . " can't be processed - no document data");
    exit(0);
  };

  # The base tokenization may be overridden by --token <foundry>#<layer>
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($label . " can't be processed - no base tokenization");
    exit(0);
  };

  # All foundry/layer pairs that are added by default
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add what is explicitly allowed
  if ($skip{'#all'}) {
    foreach (@allow) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my $foundry = lc($info->[0]);
      my $layer   = lc($info->[1]);

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time;
    };
  };

  # Serialize: human readable, pretty printed JSON or compact JSON
  my $print_text = $text ? $tokens->to_string($primary) :
    ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file = $gzip ?
      IO::Compress::Gzip->new($output, Minimal => 1) :
      IO::File->new($output, "w");

    # Fail with a meaningful message if the file can't be opened
    unless ($file) {
      $log->error(
        "Unable to open $output for writing: " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);

    # Buffered write errors may only surface on close
    $file->close or $log->error("Unable to close $output properly");
  }

  else {
    print $print_text . "\n";
  };

  stop_time;
}

# Extract XML files:
# Extracts all texts given by --sigle from the Zip archive given
# by --input into the existing directory given by --output.
# Exits with status 0 if everything was extracted and 1 otherwise.
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  # NOTE(review): a missing output directory is reported
  # with exit status 0 - confirm whether this is intended
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Extraction requires an archive file - do not fail silently
  my $archive;
  unless (-f($input) && ($archive = KorAP::XML::Archive->new($input))) {
    print "Input is not an archive.\n\n";
    exit(1);
  };

  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Test will be skipped

  # Iterate over all given sigles and extract
  my $failed = 0;
  foreach my $sigle (@sigle) {
    print "$sigle ";
    unless ($archive->extract('./' . $sigle, $output)) {
      $failed++;
      print 'not ';
    };
    print "extracted.\n";
  };

  print "\n";

  # Only signal an error if at least one sigle could not be extracted
  exit($failed ? 1 : 0);
}

# Process an archive:
# Converts all texts of a directory or a Zip archive given by --input
# and writes one file per text to the directory given by --output.
# The conversion of each text is delegated to write_file() and may
# run in multiple forks (see --jobs).
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The first two parameters are the process id and the exit code,
      # the last one is the reference passed to finish()
      my ($pid, $code) = @_;
      my $data = $_[-1];
      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data eq 'SCALAR' ? $$data : '') . "\n";
    }
  );

  # Start time of processing - stays undefined if nothing can be processed
  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $path;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($path = $it->get) && $path =~ s{/data\.xml$}{}) {
        push @dirs, $path;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text_id) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text_id)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory - get_file_name() strips this
        # prefix when write_file() names the output file
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text_id);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  print "Done.\n";

  # There is nothing to measure if processing never started
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}

__END__

=pod

=encoding utf8

=head1 NAME

korapxml2krill - Merge KorapXML data and create Krill friendly documents


=head1 SYNOPSIS

  $ korapxml2krill [archive] -z --input <directory> --output <filename>


=head1 DESCRIPTION

L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.


=head1 INSTALLATION

The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.

  $ cpanm https://github.com/KorAP/KorAP-XML-Krill

In case everything went well, the C<korapxml2krill> command line tool will
be available.


=head1 ARGUMENTS

=over 2

=item B<archive>

Process an archive as a Zip-file or a folder of KorAP-XML documents.

=item B<extract>

Extract KorAP-XML files from a Zip-file.

=back


=head1 OPTIONS

=over 2

=item B<--input|-i> <directory|file>

Directory or archive file of documents to index.

=item B<--output|-o> <directory|file>

Output folder for archive processing or
document name for single output (optional).
Writes to C<STDOUT> by default.

=item B<--overwrite|-w>

Overwrite files that already exist.

=item B<--token|-t> <foundry>[#<file>]

Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.

=item B<--skip|-s> <foundry>[#<layer>]

Skip specific foundries by specifying the name
or specific layers by appending the layer name
to the foundry name, separated by a #,
e.g. C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Can be set multiple times.

=item B<--allow|-a> <foundry>#<layer>

Allow specific foundries and layers by
combining the foundry name with a # and the layer name.

=item B<--primary|-p>

Output primary data or not. Defaults to true.
Can be flagged using --no-primary as well.

=item B<--jobs|-j>

Define the number of concurrent jobs in separate forks
for archive processing, defaults to 0. This is B<EXPERIMENTAL>!

=item B<--human|-m>

Represent the data in a human friendly way, while the output defaults to JSON.

=item B<--pretty|-y>

Pretty print JSON output.

=item B<--gzip|-z>

Compress the output (expects a defined output file in single processing).

=item B<--sigle|-sg>

Extract the given text sigles.
Currently only supported on C<extract>.
Can be set multiple times.

=item B<--log|-l>

The L<Log4perl> log level, defaults to C<ERROR>.

=item B<--help|-h>

Print this document.

=item B<--version|-v>

Print version information.

=back

=head1 AVAILABILITY

  https://github.com/KorAP/KorAP-XML-Krill


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>

L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.

This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

=cut