blob: b60eb5949c8b7ac5672868fd4dafda83f2ef0c9d [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron3741f8b2016-12-21 19:55:21 +010076# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020077# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron3741f8b2016-12-21 19:55:21 +010079# 2016/12/21
80# - added support for base-sentences and base-tokenizations
81#
Akron941c1a62016-02-23 17:41:41 +010082# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010083
# Date of the most recent change, shown in the version banner
our $LAST_CHANGE = '2016/12/21';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Banner printed together with the help and version output
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";

# Parse command: A first argument that does not start
# with a dash is taken off the argument list as the command
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +010096
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# Scalar options are declared in place, together with their defaults.
# In case an option is unknown or malformed, GetOptions() returns
# false - then the usage is printed and the script stops,
# instead of converting with silently ignored options.
GetOptions(
  'input|i=s'   => \@input,
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base = 'OpenNLP#tokens'),
  'base-sentences|bs=s'  => \(my $base_sentences = ''),
  'base-paragraphs|bp=s' => \(my $base_paragraphs = ''),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),

  # Print the documentation
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
137
# The base layer names are compared in lower case
$_ = lc $_ for ($base_sentences, $base_paragraphs);

# Parameters for pod2usage() on invalid calls
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Initialize log4perl object:
# Everything from the requested level on is written to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
164
# Lower-cased lookup table of everything that should be skipped
my %skip = map { lc($_) => 1 } @skip;

# The DeReKo structure layer gets a third element, in case
# base sentences and/or base paragraphs are taken from it
my $dereko = 'dereko#structure';
my @dereko_structure = ('DeReKo', 'Structure');
if ($base_sentences eq $dereko && $base_paragraphs eq $dereko) {
  push @dereko_structure, 'base-sentences-paragraphs';
}
elsif ($base_sentences eq $dereko) {
  push @dereko_structure, 'base-sentences';
}
elsif ($base_paragraphs eq $dereko) {
  push @dereko_structure, 'base-paragraphs';
};

# All known annotation layers as [Foundry, Layer] pairs
my @layers = (

  # Base - only used as long as no other base layer was requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  [@dereko_structure],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly
# requested annotations are converted
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations, where neither the
# foundry nor the Foundry#Layer is skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);
    !$skip{$foundry} && !$skip{$foundry . '#' . $layer};
  } @layers;
};
248
# Get tokenization basis (given as Foundry#Layer).
# The variables are declared unconditionally, as the behaviour
# of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
251
# Shared cache, mapped to the cache file
# TODO: This should not be initialized for batch
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object, that converts a single
# text based on the options given
my @batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
272
Akron941c1a62016-02-23 17:41:41 +0100273
274# Get file name based on path information
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name, in which
# the path separators are replaced by dashes.
# The name is relative to the first input: In case the first
# input is a directory, its last path segment is kept in the name.
sub get_file_name ($) {
  my $file = shift;

  # Base path that is removed from the file path
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path (with an optional
  # slash in front). The path is quoted, so characters like
  # '.', '+' or '(' are matched literally instead of being
  # interpreted as part of a regular expression.
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path and remove leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # In case of more than three dash separated parts,
  # the first part is removed
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
290
# Convert sigle to path construct
# (Corpus_Document.Text becomes Corpus/Document/Text)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Commands write to an output directory, that has to exist.
# This is an error, therefore the exit status is not zero.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
300
301
Akron941c1a62016-02-23 17:41:41 +0100302# Process a single file
unless ($cmd) {

  # Remember the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and since the start of the script
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Take the first input and make sure it ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000332
Akrone10ad322016-02-27 10:54:26 +0100333# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    my $prefix = 1;

    # Number of sigles that could not be extracted
    my $failed = 0;

    # No sigles given - extract all texts of the archive
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {

      # Sigles that are not doc sigles and are extracted as texts below
      my @new_sigle;

      my $prefix_check = 0;

      # Iterate over all sigle
      foreach (@sigle) {

        # Sigle is a doc sigle (two path segments only)
        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$_ ...\n";

          # Check if a prefix is needed (only once)
          unless ($prefix_check) {
            $prefix = $archive->check_prefix;
            $prefix_check = 1;
          };

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $_;

          my $extracted = $archive->extract_doc($path, $output, $jobs) ? 1 : 0;
          $failed++ unless $extracted;

          print '... ' . ($extracted ? '' : 'not ');
          print "extracted.\n";
        }
        else {
          push @new_sigle, $_;
        };
      };
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ...\n";

      # TODO: Make this OS independent
      my $extracted = $archive->extract_text(
        ($prefix ? './' : '') . $_, $output
      ) ? 1 : 0;
      $failed++ unless $extracted;

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # The exit status is zero, if everything was extracted
    exit($failed ? 1 : 0);
  }

  # Can't create archive object - this is an error
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
422
Akron941c1a62016-02-23 17:41:41 +0100423# Process an archive
elsif ($cmd eq 'archive') {

  # Temporary directories are created using File::Temp,
  # so make sure the module is loaded
  require File::Temp;

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message.
  # The last argument is the data structure passed to finish():
  # A message and (optionally) the temporary directory
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if (my $return = $batch_file->process($dir => $filename)) {

          # Pass the temporary directory, so it is released on finish
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
          #$pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {

          # Pass the temporary directory, so it is released on finish
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  # Nothing can be processed - stop here with an error,
  # as no start time exists, that could be reported below
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000591
592__END__
Akron941c1a62016-02-23 17:41:41 +0100593
594=pod
595
596=encoding utf8
597
598=head1 NAME
599
Akronf7ad89e2016-03-16 18:22:47 +0100600korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100601
602
603=head1 SYNOPSIS
604
Akrona76d8352016-10-27 16:27:32 +0200605 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100606
Akron2fd402b2016-10-27 21:26:48 +0200607
Akron941c1a62016-02-23 17:41:41 +0100608=head1 DESCRIPTION
609
610L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
611compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100612The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100613
614
615=head1 INSTALLATION
616
617The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
618
Akronaf386982016-10-12 00:33:25 +0200619 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100620
Akronc13a1702016-03-15 19:33:14 +0100621In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100622be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200623Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100625
626=head1 ARGUMENTS
627
Akrona76d8352016-10-27 16:27:32 +0200628 $ korapxml2krill -z --input <directory> --output <filename>
629
630Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200631It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200632
Akron941c1a62016-02-23 17:41:41 +0100633=over 2
634
635=item B<archive>
636
Akrona76d8352016-10-27 16:27:32 +0200637 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
638
Akron2fd402b2016-10-27 21:26:48 +0200639Converts an archive of KorAP-XML documents. It expects a directory
640(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100641
642=item B<extract>
643
Akrona76d8352016-10-27 16:27:32 +0200644 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
645
646Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100647
648=back
649
650
651=head1 OPTIONS
652
653=over 2
654
Akrona76d8352016-10-27 16:27:32 +0200655=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100656
Akrona76d8352016-10-27 16:27:32 +0200657Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100658
Akron7606afa2016-10-25 16:23:49 +0200659Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100660document, while C<archive> expects a KorAP-XML corpus folder or a zip
661file to batch process multiple files.
662C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200663
Akrona76d8352016-10-27 16:27:32 +0200664C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200665that the first archive listed contains all primary data files
666and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200667
Akron7606afa2016-10-25 16:23:49 +0200668 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200669
Akron0c3e3752016-06-28 15:55:53 +0200670(The directory structure follows the base directory format,
671that may include a C<.> root folder.
672In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200673need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200675
Akron7606afa2016-10-25 16:23:49 +0200676To support zip files, a version of C<unzip> needs to be installed that is
677compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200678
Akron7606afa2016-10-25 16:23:49 +0200679B<The root folder switch using the hash sign is experimental and
680may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200681
Akron941c1a62016-02-23 17:41:41 +0100682=item B<--output|-o> <directory|file>
683
684Output folder for archive processing or
685document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100686writes to C<STDOUT> by default
687(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100688
689=item B<--overwrite|-w>
690
691Overwrite files that already exist.
692
Akron3741f8b2016-12-21 19:55:21 +0100693=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100694
695Define the default tokenization by specifying
696the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100697of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100698
Akron3741f8b2016-12-21 19:55:21 +0100699
700=item B<--base-sentences|-bs> <foundry>#<layer>
701
702Define the layer for base sentences.
703If given, this will be used instead of using C<Base#Sentences>.
704Currently C<DeReKo#Structure> is the only additional layer supported.
705
706 Defaults to unset.
707
708
709=item B<--base-paragraphs|-bp> <foundry>#<layer>
710
711Define the layer for base paragraphs.
712If given, this will be used instead of using C<Base#Paragraphs>.
713Currently C<DeReKo#Structure> is the only additional layer supported.
714
715 Defaults to unset.
716
717
Akron941c1a62016-02-23 17:41:41 +0100718=item B<--skip|-s> <foundry>[#<layer>]
719
Akronf7ad89e2016-03-16 18:22:47 +0100720Skip specific annotations by specifying the foundry
721(and optionally the layer with a C<#>-prefix),
722e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100723Can be set multiple times.
724
Akronc13a1702016-03-15 19:33:14 +0100725=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100726
Akronf7ad89e2016-03-16 18:22:47 +0100727Convert specific annotations by specifying the foundry
728(and optionally the layer with a C<#>-prefix),
729e.g. C<Mate> or C<Mate#Morpho>.
730Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100731
732=item B<--primary|-p>
733
Akronc13a1702016-03-15 19:33:14 +0100734Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100735Can be flagged using C<--no-primary> as well.
736This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100737
738=item B<--jobs|-j>
739
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100741for archive processing.
Akron11c80302016-03-18 19:44:43 +0100742Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100743This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100744
Akron35db6e32016-03-17 22:42:22 +0100745=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100746
Akron35db6e32016-03-17 22:42:22 +0100747Define the metadata parser to use. Defaults to C<I5>.
748Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
749This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100750
751=item B<--pretty|-y>
752
Akronc13a1702016-03-15 19:33:14 +0100753Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100754This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100755
756=item B<--gzip|-z>
757
Akronf7ad89e2016-03-16 18:22:47 +0100758Compress the output.
759Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100760
Akron11c80302016-03-18 19:44:43 +0100761=item B<--cache|-c>
762
763File to mmap a cache (using L<Cache::FastMmap>).
764Defaults to C<korapxml2krill.cache> in the calling directory.
765
766=item B<--cache-size|-cs>
767
768Size of the cache. Defaults to C<50m>.
769
770=item B<--cache-init|-ci>
771
772Initialize cache file.
773Can be flagged using C<--no-cache-init> as well.
774Defaults to C<true>.
775
776=item B<--cache-delete|-cd>
777
778Delete cache file after processing.
779Can be flagged using C<--no-cache-delete> as well.
780Defaults to C<true>.
781
Akrone10ad322016-02-27 10:54:26 +0100782=item B<--sigle|-sg>
783
Akron20807582016-10-26 17:11:34 +0200784Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100785Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100786I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200787Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200788In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200789On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100790
Akron941c1a62016-02-23 17:41:41 +0100791=item B<--log|-l>
792
793The L<Log::Log4perl> log level. Defaults to C<ERROR>.
794
795=item B<--help|-h>
796
797Print this document.
798
799=item B<--version|-v>
800
801Print version information.
802
803=back
804
Akronc13a1702016-03-15 19:33:14 +0100805=head1 ANNOTATION SUPPORT
806
807L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
808developed in the KorAP project that are part of the KorAP preprocessing pipeline.
809The base foundry with paragraphs, sentences, and the text element is mandatory for
810L<Krill|https://github.com/KorAP/Krill>.
811
Akronf7ad89e2016-03-16 18:22:47 +0100812=over 2
Akronc13a1702016-03-15 19:33:14 +0100813
814=item B<Base>
815
816=over 4
817
Akronf7ad89e2016-03-16 18:22:47 +0100818=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100819
Akronf7ad89e2016-03-16 18:22:47 +0100820=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100821
822=back
823
824=item B<Connexor>
825
826=over 4
827
Akronf7ad89e2016-03-16 18:22:47 +0100828=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100829
Akronf7ad89e2016-03-16 18:22:47 +0100830=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100831
Akronf7ad89e2016-03-16 18:22:47 +0100832=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100833
Akronf7ad89e2016-03-16 18:22:47 +0100834=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100835
836=back
837
838=item B<CoreNLP>
839
840=over 4
841
Akronf7ad89e2016-03-16 18:22:47 +0100842=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100843
Akronf7ad89e2016-03-16 18:22:47 +0100844=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100845
Akronf7ad89e2016-03-16 18:22:47 +0100846=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100847
Akronf7ad89e2016-03-16 18:22:47 +0100848=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100849
850=back
851
852=item B<DeReKo>
853
854=over 4
855
Akronf7ad89e2016-03-16 18:22:47 +0100856=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100857
858=back
859
860=item B<Glemm>
861
862=over 4
863
Akronf7ad89e2016-03-16 18:22:47 +0100864=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100865
866=back
867
868=item B<Mate>
869
870=over 4
871
Akronf7ad89e2016-03-16 18:22:47 +0100872=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100873
Akronf7ad89e2016-03-16 18:22:47 +0100874=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100875
876=back
877
878=item B<OpenNLP>
879
880=over 4
881
Akronf7ad89e2016-03-16 18:22:47 +0100882=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100883
Akronf7ad89e2016-03-16 18:22:47 +0100884=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100885
886=back
887
888=item B<Sgbr>
889
890=over 4
891
Akronf7ad89e2016-03-16 18:22:47 +0100892=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100893
Akronf7ad89e2016-03-16 18:22:47 +0100894=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100895
896=back
897
898=item B<TreeTagger>
899
900=over 4
901
Akronf7ad89e2016-03-16 18:22:47 +0100902=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100903
Akronf7ad89e2016-03-16 18:22:47 +0100904=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100905
906=back
907
908=item B<XIP>
909
910=over 4
911
Akronf7ad89e2016-03-16 18:22:47 +0100912=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100913
Akronf7ad89e2016-03-16 18:22:47 +0100914=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100915
Akronf7ad89e2016-03-16 18:22:47 +0100916=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100917
918=back
919
920=back
921
922More importers are in preparation.
923New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
924See the built-in annotation importers as examples.
925
Akron941c1a62016-02-23 17:41:41 +0100926=head1 AVAILABILITY
927
928 https://github.com/KorAP/KorAP-XML-Krill
929
930
931=head1 COPYRIGHT AND LICENSE
932
Akron3ec0a1c2017-01-18 14:41:55 +0100933Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100934
Akron941c1a62016-02-23 17:41:41 +0100935Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200936Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100937
938L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
939Corpus Analysis Platform at the
940L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
941member of the
942L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
943
944This program is free software published under the
945L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
946
947=cut