blob: 1a418afc8fd6f5c72da67a4633150c091963d16e [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron2fd402b2016-10-27 21:26:48 +020076# 2016/10/27
77# - Added wildcard support for document extraction
Akron941c1a62016-02-23 17:41:41 +010078# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010079
Akron2fd402b2016-10-27 21:26:48 +020080our $LAST_CHANGE = '2016/10/27';
Akron941c1a62016-02-23 17:41:41 +010081our $LOCAL = $FindBin::Bin;
82our $VERSION_MSG = <<"VERSION";
83Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
84VERSION
85
Akron941c1a62016-02-23 17:41:41 +010086# Parse command
my $cmd;
our @ARGV;

# A leading argument that does not start with a dash is taken as the
# sub command and removed from the argument list before option parsing
if ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') {
  $cmd = shift @ARGV;
};

# Options that may be given multiple times
my (@input, @skip, @sigle, @anno);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with a default value
my $token_base   = 'OpenNLP#tokens';
my $cache_file   = 'korapxml2krill.cache';
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;
my $log_level    = 'ERROR';
my $jobs         = 0;

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'   => \@input,
  'output|o=s'  => \$output,
  'overwrite|w' => \$overwrite,
  'gzip|z'      => \$gzip,
  'pretty|y'    => \$pretty,
  'primary|p!'  => \$primary,

  # Conversion
  'meta|m=s'    => \$meta,
  'token|t=s'   => \$token_base,
  'skip|s=s'    => \@skip,
  'anno|a=s'    => \@anno,
  'sigle|sg=s'  => \@sigle,

  # Runtime behaviour
  'jobs|j=i'    => \$jobs,
  'log|l=s'     => \$log_level,

  # Meta data cache
  'cache|c=s'        => \$cache_file,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,

  # Print the selected sections of the manual
  'help|h' => sub {
    pod2usage(
      -msg      => $VERSION_MSG,
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -msg     => $VERSION_MSG,
      -verbose => 0,
      -output  => '-'
    );
  }
);
131
# Usage message that is printed, with exit status 1,
# whenever the invocation is not valid
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);

# Reject invalid invocations:
# - Input has to be defined
# - Gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};

# Log4perl configuration: colored messages on STDERR
# at the level requested on the command line
my %log_config = (
  'log4perl.rootLogger' =>
    uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' =>
    'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' =>
    'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' =>
    '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
155
# Lowercased lookup table of everything passed via --skip
my %skip = map { ( lc($_) => 1 ) } @skip;

# All supported annotation layers as [Foundry, Layer] pairs.
# Each row names a foundry followed by its layers;
# the order of rows and layers is preserved in @layers.
my @layers = map {
  my ($foundry, @names) = @$_;
  map { [ $foundry, $_ ] } @names;
} (
  [ Base       => qw/Sentences Paragraphs/ ],
  [ Connexor   => qw/Morpho Syntax Phrase Sentences/ ],
  [ CoreNLP    => qw/NamedEntities Sentences Morpho Constituency/ ],
  [ DeReKo     => qw/Structure/ ],
  [ Glemm      => qw/Morpho/ ],
  [ Malt       => qw/Dependency/ ],
  [ MDParser   => qw/Dependency/ ],
  [ Mate       => qw/Morpho Dependency/ ],
  [ OpenNLP    => qw/Morpho Sentences/ ],

  # Schreibgebrauch
  [ Sgbr       => qw/Lemma Morpho/ ],
  [ TreeTagger => qw/Morpho Sentences/ ],
  [ XIP        => qw/Morpho Constituency Sentences Dependency/ ]
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations requested
# explicitly via --anno (Foundry#Layer) are taken
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# neither by Foundry nor by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc $_->[0];
    my $layer   = lc $_->[1];
    !$skip{$foundry} && !$skip{$foundry . '#' . $layer};
  } @layers;
};
227
# Get tokenization basis (Foundry#Layer).
# Declaration and conditional assignment are kept apart on purpose:
# "my ... if ..." is documented in perlsyn as having undefined behaviour.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Shared meta data cache, backed by the file given via --cache
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that carries all conversion settings
# and is used by every processing mode below
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
251
Akron941c1a62016-02-23 17:41:41 +0100252
# Get file name based on path information.
#
# Takes the path of a text and returns a flat name in which the
# path separators are replaced by dashes. Temporary directory
# fragments and the location of the first input are removed first.
sub get_file_name ($) {
  my $file = shift;

  # Base path is derived from the first input; for a directory
  # the last path segment (without trailing slash) is cut off
  my $base = $input[0];
  $base =~ s![^/]+$!! if -d $base;

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path at the beginning of the file path.
  # The base is quoted, so characters like '.', '+' or '(' in the
  # input path are matched literally instead of as regex syntax.
  $file =~ s{^/?\Q$base\E}{};

  # Flatten the path
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};

  # Drop the first dash separated segment,
  # in case at least three segments follow
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
269
# Convert sigle to path construct (CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# With a command, a given output has to be an existing directory
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is a failure - report it to the caller with a non-zero status
  exit(1);
};
279
280
# Process a single file (no command was given)
if (!$cmd) {

  # Remember the start time at compile time already
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time elapsed since the last call
  # and since the start of the program
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info(
      join('', 'The code took: ', $since_last, ' (overall: ', $overall, ')')
    );
    $main::LAST_STOP = $now;
  };

  # The document path is passed on with a trailing slash
  my $doc_path = $input[0];
  $doc_path =~ s!([^/])$!$1/!;

  # Process file
  $batch_file->process($doc_path, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000311
# Extract XML files from the archive(s).
# Exits with status 0 after extraction and with status 1
# if the tooling or the primary archive is not usable.
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input;

    # Flag indicating if sigles need a leading './' in the archive
    # NOTE(review): stays 1 if only text sigles are given, as
    # check_prefix is only consulted for doc sigles - confirm intent
    my $prefix = 1;

    # No sigles given - extract all texts listed in the archive
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles - these are extracted immediately,
    # while text sigles are kept for the loop below
    else {
      my @new_sigle;

      my $prefix_check = 0;

      # Iterate over all sigle
      foreach (@sigle) {

        # Sigle is a doc sigle (Corpus/Document without a text part)
        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
          print "$_ ";

          # Check if a prefix is needed (only once)
          unless ($prefix_check) {
            $prefix = $archive->check_prefix;
            $prefix_check = 1;
          };

          # TODO: Make this OS independent
          print '' . (
            $archive->extract_doc(
              ($prefix ? './' : '') . $_, $output
            ) ? '' : 'not '
          );
          print "extracted.\n";
        }
        else {
          push @new_sigle, $_;
        };
      };
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ";

      # TODO: Make this OS independent
      print '' . (
        $archive->extract_text(
          ($prefix ? './' : '') . $_, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    print "\n";

    # Extraction ran through - this is not an error
    exit(0);
  }

  # Can't create archive object - report the failure to the caller
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
399
# Process an archive.
# The input is either a directory containing texts (located by their
# data.xml) or a zip archive. Every text is converted by the batch
# object into one output file, optionally in parallel forks.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories are created with File::Temp below.
  # It is loaded here explicitly, instead of relying on
  # another module to have loaded it.
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last argument
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Drop the reference to the temporary directory, if passed
      $data->[1] = undef if $data->[1];
    }
  );

  # Start of the measured time - reset when processing starts.
  # Initialized here, so the final report always has a valid
  # start time, even if the input turns out to be invalid.
  my $t = Benchmark->new;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories that contain a data.xml
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork (the parent continues with the next text)
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($text_dir => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork (the parent continues with the next text)
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory to extract the text to
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horribly slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Corpus directory inside the temporary directory
        my $corpus_dir = catdir("$temp", $corpus);

        # Directory of the extracted text
        my $dir = catdir($corpus_dir, $doc, $text);

        # Write file - the temporary directory object is passed along
        if ($batch_file->process($dir => $filename)) {
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000561
562__END__
Akron941c1a62016-02-23 17:41:41 +0100563
564=pod
565
566=encoding utf8
567
568=head1 NAME
569
Akronf7ad89e2016-03-16 18:22:47 +0100570korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100571
572
573=head1 SYNOPSIS
574
Akrona76d8352016-10-27 16:27:32 +0200575 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100576
Akron2fd402b2016-10-27 21:26:48 +0200577
Akron941c1a62016-02-23 17:41:41 +0100578=head1 DESCRIPTION
579
580L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
581compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100582The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100583
584
585=head1 INSTALLATION
586
587The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
588
Akronaf386982016-10-12 00:33:25 +0200589 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100590
Akronc13a1702016-03-15 19:33:14 +0100591In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100592be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200593Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +0200594In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100595
596=head1 ARGUMENTS
597
Akrona76d8352016-10-27 16:27:32 +0200598 $ korapxml2krill -z --input <directory> --output <filename>
599
600Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200601It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200602
Akron941c1a62016-02-23 17:41:41 +0100603=over 2
604
605=item B<archive>
606
Akrona76d8352016-10-27 16:27:32 +0200607 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
608
Akron2fd402b2016-10-27 21:26:48 +0200609Converts an archive of KorAP-XML documents. It expects a directory
610(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100611
612=item B<extract>
613
Akrona76d8352016-10-27 16:27:32 +0200614 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
615
616Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100617
618=back
619
620
621=head1 OPTIONS
622
623=over 2
624
Akrona76d8352016-10-27 16:27:32 +0200625=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100626
Akrona76d8352016-10-27 16:27:32 +0200627Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100628
Akron7606afa2016-10-25 16:23:49 +0200629Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akrona76d8352016-10-27 16:27:32 +0200630document, while C<archive> and C<extract> support zip files as well.
Akron7606afa2016-10-25 16:23:49 +0200631
Akrona76d8352016-10-27 16:27:32 +0200632C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200633that the first archive listed contains all primary data files
634and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200635
Akron7606afa2016-10-25 16:23:49 +0200636 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200637
Akron0c3e3752016-06-28 15:55:53 +0200638(The directory structure follows the base directory format,
639that may include a C<.> root folder.
640In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200641need to be passed with a hash sign in front of the archive's name.
642This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200643
Akron7606afa2016-10-25 16:23:49 +0200644To support zip files, a version of C<unzip> needs to be installed that is
645compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200646
Akron7606afa2016-10-25 16:23:49 +0200647B<The root folder switch using the hash sign is experimental and
648may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200649
Akron941c1a62016-02-23 17:41:41 +0100650=item B<--output|-o> <directory|file>
651
652Output folder for archive processing or
653document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100654writes to C<STDOUT> by default
655(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100656
657=item B<--overwrite|-w>
658
659Overwrite files that already exist.
660
661=item B<--token|-t> <foundry>[#<file>]
662
663Define the default tokenization by specifying
664the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100665of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100666
667=item B<--skip|-s> <foundry>[#<layer>]
668
Akronf7ad89e2016-03-16 18:22:47 +0100669Skip specific annotations by specifying the foundry
670(and optionally the layer with a C<#>-prefix),
671e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100672Can be set multiple times.
673
Akronc13a1702016-03-15 19:33:14 +0100674=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100675
Akronf7ad89e2016-03-16 18:22:47 +0100676Convert specific annotations by specifying the foundry
677(and optionally the layer with a C<#>-prefix),
678e.g. C<Mate> or C<Mate#Morpho>.
679Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100680
681=item B<--primary|-p>
682
Akronc13a1702016-03-15 19:33:14 +0100683Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100684Can be flagged using C<--no-primary> as well.
685This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100686
687=item B<--jobs|-j>
688
689Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100690for archive processing.
Akron11c80302016-03-18 19:44:43 +0100691Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100692This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100693
Akron35db6e32016-03-17 22:42:22 +0100694=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100695
Akron35db6e32016-03-17 22:42:22 +0100696Define the metadata parser to use. Defaults to C<I5>.
697Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
698This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100699
700=item B<--pretty|-y>
701
Akronc13a1702016-03-15 19:33:14 +0100702Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100703This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100704
705=item B<--gzip|-z>
706
Akronf7ad89e2016-03-16 18:22:47 +0100707Compress the output.
708Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100709
Akron11c80302016-03-18 19:44:43 +0100710=item B<--cache|-c>
711
712File to mmap a cache (using L<Cache::FastMmap>).
713Defaults to C<korapxml2krill.cache> in the calling directory.
714
715=item B<--cache-size|-cs>
716
717Size of the cache. Defaults to C<50m>.
718
719=item B<--cache-init|-ci>
720
721Initialize cache file.
722Can be flagged using C<--no-cache-init> as well.
723Defaults to C<true>.
724
725=item B<--cache-delete|-cd>
726
727Delete cache file after processing.
728Can be flagged using C<--no-cache-delete> as well.
729Defaults to C<true>.
730
Akrone10ad322016-02-27 10:54:26 +0100731=item B<--sigle|-sg>
732
Akron20807582016-10-26 17:11:34 +0200733Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100734Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100735I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200736Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200737In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200738On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100739
Akron941c1a62016-02-23 17:41:41 +0100740=item B<--log|-l>
741
742The L<Log4perl> log level, defaults to C<ERROR>.
743
744=item B<--help|-h>
745
746Print this document.
747
748=item B<--version|-v>
749
750Print version information.
751
752=back
753
Akronc13a1702016-03-15 19:33:14 +0100754=head1 ANNOTATION SUPPORT
755
756L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
757developed in the KorAP project that are part of the KorAP preprocessing pipeline.
758The base foundry with paragraphs, sentences, and the text element are mandatory for
759L<Krill|https://github.com/KorAP/Krill>.
760
Akronf7ad89e2016-03-16 18:22:47 +0100761=over 2
Akronc13a1702016-03-15 19:33:14 +0100762
763=item B<Base>
764
765=over 4
766
Akronf7ad89e2016-03-16 18:22:47 +0100767=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100768
Akronf7ad89e2016-03-16 18:22:47 +0100769=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100770
771=back
772
773=item B<Connexor>
774
775=over 4
776
Akronf7ad89e2016-03-16 18:22:47 +0100777=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100778
Akronf7ad89e2016-03-16 18:22:47 +0100779=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100780
Akronf7ad89e2016-03-16 18:22:47 +0100781=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100782
Akronf7ad89e2016-03-16 18:22:47 +0100783=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100784
785=back
786
787=item B<CoreNLP>
788
789=over 4
790
Akronf7ad89e2016-03-16 18:22:47 +0100791=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100796
Akronf7ad89e2016-03-16 18:22:47 +0100797=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100798
799=back
800
801=item B<DeReKo>
802
803=over 4
804
Akronf7ad89e2016-03-16 18:22:47 +0100805=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100806
807=back
808
809=item B<Glemm>
810
811=over 4
812
Akronf7ad89e2016-03-16 18:22:47 +0100813=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100814
815=back
816
817=item B<Mate>
818
819=over 4
820
Akronf7ad89e2016-03-16 18:22:47 +0100821=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100822
Akronf7ad89e2016-03-16 18:22:47 +0100823=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100824
825=back
826
827=item B<OpenNLP>
828
829=over 4
830
Akronf7ad89e2016-03-16 18:22:47 +0100831=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100832
Akronf7ad89e2016-03-16 18:22:47 +0100833=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100834
835=back
836
837=item B<Sgbr>
838
839=over 4
840
Akronf7ad89e2016-03-16 18:22:47 +0100841=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100842
Akronf7ad89e2016-03-16 18:22:47 +0100843=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100844
845=back
846
847=item B<TreeTagger>
848
849=over 4
850
Akronf7ad89e2016-03-16 18:22:47 +0100851=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100852
Akronf7ad89e2016-03-16 18:22:47 +0100853=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100854
855=back
856
857=item B<XIP>
858
859=over 4
860
Akronf7ad89e2016-03-16 18:22:47 +0100861=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100862
Akronf7ad89e2016-03-16 18:22:47 +0100863=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100864
Akronf7ad89e2016-03-16 18:22:47 +0100865=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100866
867=back
868
869=back
870
871More importers are in preparation.
872New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
873See the built-in annotation importers as examples.
874
Akron941c1a62016-02-23 17:41:41 +0100875=head1 AVAILABILITY
876
877 https://github.com/KorAP/KorAP-XML-Krill
878
879
880=head1 COPYRIGHT AND LICENSE
881
882Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100883
Akron941c1a62016-02-23 17:41:41 +0100884Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200885Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100886
887L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
888Corpus Analysis Platform at the
889L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
890member of the
891L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
892
893This program is free software published under the
894L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
895
896=cut