blob: 289c2f468348e2782bd54e7430906185b9d7334e [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp ();
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File ();
use Log::Log4perl;
use Pod::Usage;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use Parallel::ForkManager;
Akron9a04c712016-02-05 19:40:05 +010017
Akrona977e172016-02-23 17:41:41 +010018# CHANGES:
19# ----------------------------------------------------------
20# 2013/11/25
21# - Initial release
22#
23# 2014/10/29
24# - Merges foundry data to create indexer friendly documents
25#
Akron9a04c712016-02-05 19:40:05 +010026# 2016/02/04
27# - renamed to korapxml2krill
28# - added Schreibgebrauch support
Akron9078bb92016-02-12 19:09:06 +010029#
30# 2016/02/12
31# - fixed foundry skipping
Akrona977e172016-02-23 17:41:41 +010032# - Support overwrite in archive processing
Akrondba47712016-02-14 23:06:48 +010033#
34# 2016/02/14
35# - Added version information
Akrona977e172016-02-23 17:41:41 +010036# - Added support for archive files
37#
38# 2016/02/15
39# - Fixed temporary directory bug
40# - Improved skipping before unzipping
41# - Added EXPERIMENTAL concurrency support
42#
43# 2016/02/23
44# - Merge korapxml2krill and korapxml2krill_dir
Akron2978ddd2016-02-27 10:54:26 +010045#
46# 2016/02/27
47# - Added extract function
Akrona977e172016-02-23 17:41:41 +010048# ----------------------------------------------------------
Akron9078bb92016-02-12 19:09:06 +010049
# Release metadata shown in the usage and version messages
our $LAST_CHANGE = '2016/02/27';

# Directory of this script, used to call the script itself again
our $LOCAL = $FindBin::Bin;

our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";


# Parse command: a leading argument that does not start
# with a dash is taken as the command
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';
Akron9a04c712016-02-05 19:40:05 +010063
# Options that can be set multiple times need real array variables:
# \(my @list) would expand to references to the elements of the
# (still empty) array, i.e. to an empty list, and would leave the
# option without a destination.
my (@skip, @sigle, @allow);

# Parameters of the usage message for erroneous calls
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line -
# unknown or malformed options end the program with the usage message
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'human|m'     => \(my $text),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'allow|a=s'   => \@allow,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless $input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000105
Nils Diewald7364d1f2013-11-05 19:26:35 +0000106
# Initialize log4perl object:
# The root logger uses the requested level (in upper case)
# and a single appender named STDERR
my %log_conf = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
116
Akrona977e172016-02-23 17:41:41 +0100117
# Get file name based on path information.
#
# Removes the input path ($input, optionally preceded by a slash)
# from the beginning of the given path and joins the remaining path
# segments with dashes. With an input of 'corpus', for example,
# the path 'corpus/A/B/1' becomes 'A-B-1'.
sub get_file_name ($) {
  my $file = shift;

  # The input path is matched literally (\Q...\E), so characters
  # like '.' or '+' in directory names are not treated as regular
  # expression syntax, and it is only removed at the very beginning
  $file =~ s{\A/?\Q$input\E}{};

  # Path separators become dashes ...
  $file =~ tr{/}{-};

  # ... but the name must not start with one
  $file =~ s{^-+}{};
  return $file;
};
126
Akrona977e172016-02-23 17:41:41 +0100127
# Write file: Convert a single text by running this script again
# in a separate process.
#
# Expects the path of the text directory, derives the name of the
# output file below $output from it and passes the relevant
# command line options on to the new process.
# Returns the derived file name (without directory and extension).
sub write_file {
  my $anno = shift;
  my $file = get_file_name $anno;

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The command is passed as a list, so no shell is involved:
  # Values like '#all' (a comment for the shell) or paths containing
  # blanks reach the new process unchanged
  my @call = ('perl', $LOCAL . '/korapxml2krill', '-i', $anno, '-o', $target);
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # Primary data is only switched off on explicit request
  push @call, '--no-primary' if defined $primary && !$primary;

  # Pretty printing is a flag and takes no value
  push @call, '-y' if $pretty;

  push @call, map { ('-a', $_) } @allow;
  push @call, map { ('-s', $_) } @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error('Conversion of ' . $anno . ' failed');
  };

  return "$file";
};
149
Nils Diewald2db9ad02013-10-29 19:26:43 +0000150
# Convert sigle to path construct,
# e.g. 'GOE_AGA.03828' becomes 'GOE/AGA/03828'.
# The loop variable aliases the list elements, so @sigle is
# changed in place.
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
153
# Process a single file
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip (compared in lower case)
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last call
  # and since the start of the program
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document -
  # the input path needs a trailing slash
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new( path => $input );

  # Name of the document in log messages:
  # There is no output file when writing to STDOUT
  my $name = defined $output ? $output : $input;

  unless ($doc->parse) {
    $log->warn($name . " can't be processed - no document data");
    exit(0);
  };

  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($name . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotations as [Foundry, Layer],
  # in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the explicitly allowed layers
  if ($skip{'#all'}) {
    foreach (@allow) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time;
    };
  };

  # Serialize: human friendly, pretty printed JSON or compact JSON
  my $print_text = $text ? $tokens->to_string($primary) :
    ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file = $gzip
      ? IO::Compress::Gzip->new($output, Minimal => 1)
      : IO::File->new($output, "w");

    # Both constructors return undef on failure
    unless ($file) {
      $log->error(
        'Unable to write to ' . $output . ': ' . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  stop_time;
}
Nils Diewald5b4865f2014-11-05 18:20:50 +0000303
# Extract XML files:
# Unpacks the texts given by --sigle from the Zip-file given by
# --input into the existing directory given by --output
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Test will be skipped

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {
      print "$sigle ";
      print '' . ($archive->extract('./'. $sigle, $output) ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # Finishing the extraction is not an error
    exit(0);
  }

  # Without an archive there is nothing to extract from
  else {
    print "Input is not an archive.\n\n";
    exit(1);
  };
}
336
# Process an archive:
# Converts all texts of a directory or a Zip-file given by --input
# and writes one file per text to the directory given by --output
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback arguments start with the process id and the exit
      # code, the data reference passed to finish() comes last
      my ($pid, $code) = @_;
      my $data = $_[-1];

      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .

        # There may be no data in case the process died
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Do not convert texts again that were already written
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      # (the text part is not named $text, as this is the --human flag)
      my ($prefix, $corpus, $doc, $text_id) = $archive->split_path($text_path);

      # Do not convert texts again that were already written
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text_id)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory -
        # get_file_name() removes $input from the path of the text
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text_id);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before the fork finishes
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;

        # The report adds the line break
        $msg = "Unable to extract " . $text_path;
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing was processed, so there is nothing to wait for or to time
  else {
    print "Input is neither a directory nor an archive.\n\n";
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000496
497__END__
Akrona977e172016-02-23 17:41:41 +0100498
499=pod
500
501=encoding utf8
502
503=head1 NAME
504
505korapxml2krill - Merge KorapXML data and create Krill friendly documents
506
507
508=head1 SYNOPSIS
509
510 $ korapxml2krill [archive] -z --input <directory> --output <filename>
511
512
513=head1 DESCRIPTION
514
515L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
516compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
517
518
519=head1 INSTALLATION
520
521The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
522
523 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
524
525In case everything went well, the C<korapxml2krill> command line tool will
526be available.
527
528
529=head1 ARGUMENTS
530
531=over 2
532
533=item B<archive>
534
Akron2978ddd2016-02-27 10:54:26 +0100535Process an archive as a Zip-file or a folder of KorAP-XML documents.
536
537=item B<extract>
538
539Extract KorAP-XML files from a Zip-file.
Akrona977e172016-02-23 17:41:41 +0100540
541=back
542
543
544=head1 OPTIONS
545
546=over 2
547
548=item B<--input|-i> <directory|file>
549
550Directory or archive file of documents to index.
551
552=item B<--output|-o> <directory|file>
553
554Output folder for archive processing or
555document name for single output (optional),
556writes to <STDOUT> by default.
557
558=item B<--overwrite|-w>
559
560Overwrite files that already exist.
561
562=item B<--token|-t> <foundry>[#<file>]
563
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
567
568=item B<--skip|-s> <foundry>[#<layer>]
569
Skip specific foundries by specifying the name of the foundry,
or specific layers by appending the name of the layer to the
name of the foundry, separated by a #, e.g. Mate#Morpho.
Alternatively you can skip #ALL, in which case only the layers
given by C<--allow> are added.
Can be set multiple times.
575
576=item B<--allow|-a> <foundry>#<layer>
577
Allow specific layers by combining the name of the foundry
and the name of the layer with a #, e.g. Mate#Morpho.
Only takes effect in combination with C<--skip #ALL>.
Can be set multiple times.
580
581=item B<--primary|-p>
582
583Output primary data or not. Defaults to true.
584Can be flagged using --no-primary as well.
585
586=item B<--jobs|-j>
587
Define the number of concurrent jobs in separate forks
for archive processing, defaults to 0. This is B<EXPERIMENTAL>!
590
591=item B<--human|-m>
592
593Represent the data human friendly, while the output defaults to JSON.
594
595=item B<--pretty|-y>
596
597Pretty print JSON output.
598
599=item B<--gzip|-z>
600
601Compress the output (expects a defined output file in single processing).
602
Akron2978ddd2016-02-27 10:54:26 +0100603=item B<--sigle|-sg>
604
605Extract the given text sigles.
606Currently only supported on C<extract>.
607Can be set multiple times.
608
Akrona977e172016-02-23 17:41:41 +0100609=item B<--log|-l>
610
611The L<Log4perl> log level, defaults to C<ERROR>.
612
613=item B<--help|-h>
614
615Print this document.
616
617=item B<--version|-v>
618
619Print version information.
620
621=back
622
623=head1 AVAILABILITY
624
625 https://github.com/KorAP/KorAP-XML-Krill
626
627
628=head1 COPYRIGHT AND LICENSE
629
630Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
631Author: L<Nils Diewald|http://nils-diewald.de/>
632
633L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
634Corpus Analysis Platform at the
635L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
636member of the
637L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
638
639This program is free software published under the
640L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
641
642=cut