blob: 6b7b9df61285b322041ad26ccae6385babfeace8 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
68# - Fixed temporary path issue in script.
69#
Akron941c1a62016-02-23 17:41:41 +010070# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010071
# Date of the last entry in the change log above
our $LAST_CHANGE = '2016/10/15';

# Directory of this script as reported by FindBin
our $LOCAL = $FindBin::Bin;

# Version banner (a single line terminated by a newline),
# passed as the message to pod2usage
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
77
# Parse command:
# The first argument selects the command, unless it is false
# or starts with a dash (and is therefore treated as an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /\A-/;

# Options that can be set multiple times
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +010087
# Parse options from the command line

# Options without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with a default value
my $token_base   = 'OpenNLP#tokens';
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,

  # Print the usage sections together with the version banner
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner with the least verbose usage message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
123
# Parameters for pod2usage in case the script was called incorrectly:
# Same sections and banner as for --help, but with exit status 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000137
# Initialize log4perl object:
# The root logger uses the (uppercased) level given by --log
# and writes to a single appender named STDERR
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
147
# Lookup table of everything passed to --skip (keys are lowercased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# All annotation layers known to the script as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations given by --anno
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc $_->[0];
    my $layer   = lc $_->[1];

    # Keep unless the Foundry or the Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
219
# Get tokenization basis:
# The foundry and the (optional) layer are separated by a hash sign.
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Memory mapped cache, configured by the --cache* options
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that is used by all commands to convert texts
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
243
Akron941c1a62016-02-23 17:41:41 +0100244
# Get file name based on path information.
#
# Expects the path of a text and returns a flat file name:
# A leading temporary directory fragment and the path of the
# first input are removed, all remaining slashes are replaced by
# dashes and leading dashes are stripped.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input, that is removed from the file name.
  # For directories everything after the last slash is cut off first.
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the input path (with an optional
  # leading slash). The path is quoted, so regex meta characters in
  # the path (like dots) are matched literally.
  $file =~ s!/?\Q$base\E!!;

  # Flatten the remaining path
  $file =~ tr!/!-!;
  $file =~ s{^-+}{};
  return $file;
};
260
# Convert sigle to path construct
# (e.g. "Corpus_Doc.Text" becomes "Corpus/Doc/Text",
# surrounding whitespace is removed)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# If a command is given, a defined output has to be an existing directory.
# This is an error, so the script exits with a non-zero status.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
270
271
# Process a single file
if (!$cmd) {

  # Take the start times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and
  # since the start, and remember the new stop
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info("The code took: $lap (overall: $overall)");
    $main::LAST_STOP = $now;
  };

  # Only the first input is processed - ensure a trailing slash
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000302
# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object - the first input has to be a file
  my $archive;
  unless (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {

    # Can't create archive object - this is reported as a failure
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input;

  # Sigles are prefixed with './' on extraction, unless the
  # paths listed in the archive indicate otherwise
  my $prefix = 1;

  # No sigles given - extract all texts of the archive
  unless (@sigle) {

    # Get files
    foreach ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  };

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    # Print the sigle before extraction starts
    print "$sigle ";

    # TODO: Make this OS independent
    my $extracted = $archive->extract(
      ($prefix ? './' : '') . $sigle, $output
    );

    print '' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  print "\n";

  # The command ran through
  exit(0);
}
356
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories are created using File::Temp,
  # which is not loaded at the top of this script
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last argument.
      # Fall back to an empty message, if no data was retrieved.
      my $data = pop;
      $data = [''] unless ref $data eq 'ARRAY';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Drop the reference to the temporary directory, if given
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($text_dir => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if ($batch_file->process($dir => $filename)) {
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  # Nothing can be processed. Stop here with a failure instead of
  # reporting the time of a processing that was never started.
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000515
516__END__
Akron941c1a62016-02-23 17:41:41 +0100517
518=pod
519
520=encoding utf8
521
522=head1 NAME
523
Akronf7ad89e2016-03-16 18:22:47 +0100524korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100525
526
527=head1 SYNOPSIS
528
Akronc13a1702016-03-15 19:33:14 +0100529 $ korapxml2krill -z --input <directory> --output <filename>
530 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100532
533
534=head1 DESCRIPTION
535
536L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
537compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100538The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100539
540
541=head1 INSTALLATION
542
543The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
544
Akronaf386982016-10-12 00:33:25 +0200545 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100546
Akronc13a1702016-03-15 19:33:14 +0100547In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100548be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200549Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akron941c1a62016-02-23 17:41:41 +0100550
551=head1 ARGUMENTS
552
553=over 2
554
555=item B<archive>
556
Akrone10ad322016-02-27 10:54:26 +0100557Process an archive as a Zip-file or a folder of KorAP-XML documents.
558
559=item B<extract>
560
561Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100562
563=back
564
565
566=head1 OPTIONS
567
568=over 2
569
Akron2cfe8092016-06-24 17:48:49 +0200570=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100571
Akronf7ad89e2016-03-16 18:22:47 +0100572Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100573
Akron0c3e3752016-06-28 15:55:53 +0200574Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200575that the first archive listed contains all primary data files
576and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200577
  -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
579
Akron0c3e3752016-06-28 15:55:53 +0200580(The directory structure follows the base directory format,
581that may include a C<.> root folder.
582In this case further archives lacking a C<.> root folder
583need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200584
Akron651cb8d2016-08-16 21:44:49 +0200585B<The root folder switch is experimental and may vanish in future versions.>
586
Akron941c1a62016-02-23 17:41:41 +0100587=item B<--output|-o> <directory|file>
588
589Output folder for archive processing or
590document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100591writes to C<STDOUT> by default
592(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100593
594=item B<--overwrite|-w>
595
596Overwrite files that already exist.
597
598=item B<--token|-t> <foundry>[#<file>]
599
600Define the default tokenization by specifying
601the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100602of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100603
604=item B<--skip|-s> <foundry>[#<layer>]
605
Akronf7ad89e2016-03-16 18:22:47 +0100606Skip specific annotations by specifying the foundry
607(and optionally the layer with a C<#>-prefix),
608e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100609Can be set multiple times.
610
Akronc13a1702016-03-15 19:33:14 +0100611=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100612
Akronf7ad89e2016-03-16 18:22:47 +0100613Convert specific annotations by specifying the foundry
614(and optionally the layer with a C<#>-prefix),
615e.g. C<Mate> or C<Mate#Morpho>.
616Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100617
618=item B<--primary|-p>
619
Akronc13a1702016-03-15 19:33:14 +0100620Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100621Can be flagged using C<--no-primary> as well.
622This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100623
624=item B<--jobs|-j>
625
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100627for archive processing.
Akron11c80302016-03-18 19:44:43 +0100628Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100629This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100630
Akron35db6e32016-03-17 22:42:22 +0100631=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100632
Akron35db6e32016-03-17 22:42:22 +0100633Define the metadata parser to use. Defaults to C<I5>.
634Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
635This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100636
637=item B<--pretty|-y>
638
Akronc13a1702016-03-15 19:33:14 +0100639Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100640This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100641
642=item B<--gzip|-z>
643
Akronf7ad89e2016-03-16 18:22:47 +0100644Compress the output.
645Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100646
Akron11c80302016-03-18 19:44:43 +0100647=item B<--cache|-c>
648
649File to mmap a cache (using L<Cache::FastMmap>).
650Defaults to C<korapxml2krill.cache> in the calling directory.
651
652=item B<--cache-size|-cs>
653
654Size of the cache. Defaults to C<50m>.
655
656=item B<--cache-init|-ci>
657
658Initialize cache file.
659Can be flagged using C<--no-cache-init> as well.
660Defaults to C<true>.
661
662=item B<--cache-delete|-cd>
663
664Delete cache file after processing.
665Can be flagged using C<--no-cache-delete> as well.
666Defaults to C<true>.
667
Akrone10ad322016-02-27 10:54:26 +0100668=item B<--sigle|-sg>
669
670Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100671Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100672I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200673Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100674
Akron941c1a62016-02-23 17:41:41 +0100675=item B<--log|-l>
676
677The L<Log4perl> log level, defaults to C<ERROR>.
678
679=item B<--help|-h>
680
681Print this document.
682
683=item B<--version|-v>
684
685Print version information.
686
687=back
688
Akronc13a1702016-03-15 19:33:14 +0100689=head1 ANNOTATION SUPPORT
690
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
692developed in the KorAP project that are part of the KorAP preprocessing pipeline.
693The base foundry with paragraphs, sentences, and the text element are mandatory for
694L<Krill|https://github.com/KorAP/Krill>.
695
Akronf7ad89e2016-03-16 18:22:47 +0100696=over 2
Akronc13a1702016-03-15 19:33:14 +0100697
698=item B<Base>
699
700=over 4
701
Akronf7ad89e2016-03-16 18:22:47 +0100702=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100703
Akronf7ad89e2016-03-16 18:22:47 +0100704=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100705
706=back
707
708=item B<Connexor>
709
710=over 4
711
Akronf7ad89e2016-03-16 18:22:47 +0100712=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100713
Akronf7ad89e2016-03-16 18:22:47 +0100714=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100715
Akronf7ad89e2016-03-16 18:22:47 +0100716=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100717
Akronf7ad89e2016-03-16 18:22:47 +0100718=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100719
720=back
721
722=item B<CoreNLP>
723
724=over 4
725
Akronf7ad89e2016-03-16 18:22:47 +0100726=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100727
Akronf7ad89e2016-03-16 18:22:47 +0100728=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100729
Akronf7ad89e2016-03-16 18:22:47 +0100730=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100731
Akronf7ad89e2016-03-16 18:22:47 +0100732=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100733
734=back
735
736=item B<DeReKo>
737
738=over 4
739
Akronf7ad89e2016-03-16 18:22:47 +0100740=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100741
742=back
743
744=item B<Glemm>
745
746=over 4
747
Akronf7ad89e2016-03-16 18:22:47 +0100748=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100749
750=back
751
752=item B<Mate>
753
754=over 4
755
Akronf7ad89e2016-03-16 18:22:47 +0100756=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100757
Akronf7ad89e2016-03-16 18:22:47 +0100758=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100759
760=back
761
762=item B<OpenNLP>
763
764=over 4
765
Akronf7ad89e2016-03-16 18:22:47 +0100766=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100767
Akronf7ad89e2016-03-16 18:22:47 +0100768=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100769
770=back
771
772=item B<Sgbr>
773
774=over 4
775
Akronf7ad89e2016-03-16 18:22:47 +0100776=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100777
Akronf7ad89e2016-03-16 18:22:47 +0100778=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100779
780=back
781
782=item B<TreeTagger>
783
784=over 4
785
Akronf7ad89e2016-03-16 18:22:47 +0100786=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100787
Akronf7ad89e2016-03-16 18:22:47 +0100788=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100789
790=back
791
792=item B<XIP>
793
794=over 4
795
Akronf7ad89e2016-03-16 18:22:47 +0100796=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100797
Akronf7ad89e2016-03-16 18:22:47 +0100798=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100799
Akronf7ad89e2016-03-16 18:22:47 +0100800=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100801
802=back
803
804=back
805
806More importers are in preparation.
807New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
808See the built-in annotation importers as examples.
809
Akron941c1a62016-02-23 17:41:41 +0100810=head1 AVAILABILITY
811
812 https://github.com/KorAP/KorAP-XML-Krill
813
814
815=head1 COPYRIGHT AND LICENSE
816
817Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100818
Akron941c1a62016-02-23 17:41:41 +0100819Author: L<Nils Diewald|http://nils-diewald.de/>
820
821L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
822Corpus Analysis Platform at the
823L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
824member of the
825L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
826
827This program is free software published under the
828L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
829
830=cut