blob: 99dd33327bf10ce485eadcd6dd09e17ca4d6485b [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron93d620e2016-02-05 19:40:05 +010019
Akron941c1a62016-02-23 17:41:41 +010020# CHANGES:
21# ----------------------------------------------------------
22# 2013/11/25
23# - Initial release
24#
25# 2014/10/29
26# - Merges foundry data to create indexer friendly documents
27#
Akron93d620e2016-02-05 19:40:05 +010028# 2016/02/04
29# - renamed to korapxml2krill
30# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010031#
32# 2016/02/12
33# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010034# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010035#
36# 2016/02/14
37# - Added version information
Akron941c1a62016-02-23 17:41:41 +010038# - Added support for archive files
39#
40# 2016/02/15
41# - Fixed temporary directory bug
42# - Improved skipping before unzipping
43# - Added EXPERIMENTAL concurrency support
44#
45# 2016/02/23
46# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010047#
48# 2016/02/27
49# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010050#
51# 2016/03/17
52# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010053#
54# 2016/03/18
55# - Added meta data caching
Akron941c1a62016-02-23 17:41:41 +010056# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010057
# Date of the most recent entry in the CHANGES list above
# (shown as part of the version message)
our $LAST_CHANGE = '2016/03/18';

# Directory this script is located in - used to call the script recursively
our $LOCAL = $FindBin::Bin;

# Message printed by --version, --help and on usage errors
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION


# Parse command:
# The first argument is treated as the command (e.g. 'archive' or 'extract')
# unless it starts with a dash, in which case it is an option and
# the script runs in single-file mode ($cmd stays undefined)
my $cmd;
our @ARGV;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};
Akron93d620e2016-02-05 19:40:05 +010071
# Options that can be set multiple times.
# @anno is declared here (and not inline in the GetOptions call), because
# \(my @anno) would return a list of references to the (not yet existing)
# elements of the array instead of a reference to the array itself,
# so the values given with --anno would never be stored.
my (@skip, @sigle, @anno);
my $text;

# Parse options from the command line.
# GetOptions() returns false in case of unknown or malformed options -
# print the usage message and exit instead of continuing with a
# partially parsed command line.
GetOptions(
  'input|i=s'        => \(my $input),
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),

  # Print the full usage information
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },

  # Print the version message with the synopsis
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);
108
# Arguments passed to pod2usage() whenever the command line is unusable:
# print the usage sections together with the version message and exit with 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Input has to be defined
$input or pod2usage(%ERROR_HASH);


# Configuration of the log4perl object:
# everything at or above the requested level is written to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);

Log::Log4perl->init(\%log_config);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
129
Akron941c1a62016-02-23 17:41:41 +0100130
# Get file name based on path information.
#
# Takes the path of a text directory, removes the current input root
# (the file-scoped $input, optionally preceded by a slash) from the
# beginning of the path, turns all remaining slashes into dashes and
# strips leading dashes. With $input set to 'corpus', the path
# 'corpus/A01/00001' results in 'A01-00001'.
#
# The input root is quoted with \Q...\E, so characters like '.', '+' or '('
# in directory names are matched literally instead of being interpreted
# as regular expression syntax. The match is anchored at the start of the
# path, so only a real prefix is removed.
sub get_file_name ($) {
  my $file = shift;
  $file =~ s/^\/?\Q$input\E//;
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
139
Akron941c1a62016-02-23 17:41:41 +0100140
# Write file.
#
# Converts a single text directory by calling this script again in
# single-file mode. Takes the path of the text directory, derives the
# name of the output file below $output (see get_file_name), forwards
# the options of the current run to the child process and returns the
# derived file name (without directory and file extension).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  # The command is built as a list and not as a string, so system()
  # does not invoke a shell: paths containing whitespace stay intact and
  # values like '#all' are not treated as the start of a shell comment.
  # $^X is the perl interpreter that runs the current process.
  my @call = (
    $^X, catfile($LOCAL, 'korapxml2krill'),
    '-i', $anno,
    '-o', catfile($output, $file . '.json' . ($gzip ? '.gz' : ''))
  );

  push @call, '-z'              if $gzip;
  push @call, '-m', $meta       if $meta;
  push @call, '-w'              if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level  if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Forward the primary switch exactly as given on the command line;
  # if it was not given at all, the child applies its own default
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # --pretty is a plain flag and takes no value
  push @call, '-y' if $pretty;

  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error("Conversion of $anno failed (status $?)");
  };

  return "$file";
};
166
Nils Diewald2db9ad02013-10-29 19:26:43 +0000167
# Convert sigle to path construct:
# The part before the first underscore, the part up to the following dot
# and the remainder are joined by slashes; surrounding whitespace is
# dropped. Sigles not matching this pattern are left untouched.
for my $entry (@sigle) {
  $entry =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
170
# Process a single file
# (no command was given - convert the text directory in $input and
# write the result to $output or STDOUT)
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip, keys are lowercased
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # BEGIN blocks run at compile time,
  # so both clocks start as soon as the script is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time passed since the last call
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Name used in log messages - fall back to the input path,
  # as $output is undefined when writing to STDOUT
  my $target = $output // $input;

  # Create and parse new document
  # (the path is expected to end with a slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization, given as <foundry>[#<layer>].
  # If the layer is omitted, the default layer is kept
  # instead of being overwritten with an undefined value.
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry;
    $token_base_layer   = $layer if defined $layer && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # All foundry/layer pairs added by default, in processing order
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the explicitly requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my $foundry = lc($info->[0]);
      my $layer   = lc($info->[1]);

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time;
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1);
    }
    else {
      # Load IO::File explicitly instead of relying on other modules
      require IO::File;
      $file = IO::File->new($output, "w");
    };

    # Both constructors return undef if the file can't be opened
    unless ($file) {
      $log->error(
        "Unable to open $output for writing: " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
}
Nils Diewald59094f22014-11-05 18:20:50 +0000328
# Extract XML files
# (extracts the texts given as sigles from the archive in $input
# to the directory in $output)
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  # The output directory has to exist - this is an error,
  # so signal it with a non-zero exit status
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Test will be skipped

    # Iterate over all given sigles and extract
    my $failed = 0;
    foreach my $sigle (@sigle) {
      print "$sigle ";
      if ($archive->extract('./'. $sigle, $output)) {
        print "extracted.\n";
      }
      else {
        print "not extracted.\n";
        $failed++;
      };
    };

    print "\n";

    # Exit successfully unless at least one text couldn't be extracted
    exit($failed ? 1 : 0);
  };

  # The input is no file or can't be opened as an archive
  print "Input is not an archive.\n\n";
  exit(1);
}
361
# Process an archive
# (converts all texts of a directory or a zip archive in $input and
# writes one file per text to the directory in $output)
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # The output directory has to exist - this is an error,
  # so signal it with a non-zero exit status
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first and
      # the data reference passed to finish() as the last argument
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # The data reference may be missing, e.g. if a child died
      my $msg = ref $data ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  # Start time of the processing - stays undefined if the input is invalid
  my $t;
  print "Reading data ...\n";

  # Set up the cache file once, before any conversion starts
  # (the conversions are called with --no-cache-init)
  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect the directory of every data.xml file found below the input
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts whose output file already exists
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      # (start() returns a true value in the parent, which continues
      # with the next text)
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    # Load File::Temp explicitly instead of relying on other modules
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($path);

      # Skip texts whose output file already exists
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($path, $temp)) {

        # Create corpus directory
        # ($input is overwritten, so get_file_name() strips the
        # temporary directory from the path)
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # Nothing was processed (and no start time was taken)
  # if the input was neither a directory nor an archive
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}
527
# Unknown command:
# A command was given, but it is neither 'extract' nor 'archive' -
# complain on STDERR and print the usage message
else {
  my $complaint = "Unknown command '$cmd'.\n\n";
  warn $complaint;
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000533
534__END__
Akron941c1a62016-02-23 17:41:41 +0100535
536=pod
537
538=encoding utf8
539
540=head1 NAME
541
Akronf7ad89e2016-03-16 18:22:47 +0100542korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100543
544
545=head1 SYNOPSIS
546
  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100550
551
552=head1 DESCRIPTION
553
554L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
555compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100556The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100557
558
559=head1 INSTALLATION
560
561The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
562
563 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
564
Akronc13a1702016-03-15 19:33:14 +0100565In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100566be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100567
568
569=head1 ARGUMENTS
570
571=over 2
572
573=item B<archive>
574
Akrone10ad322016-02-27 10:54:26 +0100575Process an archive as a Zip-file or a folder of KorAP-XML documents.
576
577=item B<extract>
578
579Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100580
581=back
582
583
584=head1 OPTIONS
585
586=over 2
587
588=item B<--input|-i> <directory|file>
589
Akronf7ad89e2016-03-16 18:22:47 +0100590Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100591
592=item B<--output|-o> <directory|file>
593
594Output folder for archive processing or
595document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100596writes to C<STDOUT> by default
597(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100598
599=item B<--overwrite|-w>
600
601Overwrite files that already exist.
602
603=item B<--token|-t> <foundry>[#<file>]
604
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100608
609=item B<--skip|-s> <foundry>[#<layer>]
610
Akronf7ad89e2016-03-16 18:22:47 +0100611Skip specific annotations by specifying the foundry
612(and optionally the layer with a C<#>-prefix),
613e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100614Can be set multiple times.
615
Akronc13a1702016-03-15 19:33:14 +0100616=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100617
Akronf7ad89e2016-03-16 18:22:47 +0100618Convert specific annotations by specifying the foundry
619(and optionally the layer with a C<#>-prefix),
620e.g. C<Mate> or C<Mate#Morpho>.
621Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100622
623=item B<--primary|-p>
624
Akronc13a1702016-03-15 19:33:14 +0100625Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100626Can be flagged using C<--no-primary> as well.
627This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100628
629=item B<--jobs|-j>
630
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100635
Akron35db6e32016-03-17 22:42:22 +0100636=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100637
Akron35db6e32016-03-17 22:42:22 +0100638Define the metadata parser to use. Defaults to C<I5>.
639Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
640This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100641
642=item B<--pretty|-y>
643
Akronc13a1702016-03-15 19:33:14 +0100644Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100645This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100646
647=item B<--gzip|-z>
648
Akronf7ad89e2016-03-16 18:22:47 +0100649Compress the output.
650Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100651
Akron11c80302016-03-18 19:44:43 +0100652=item B<--cache|-c>
653
654File to mmap a cache (using L<Cache::FastMmap>).
655Defaults to C<korapxml2krill.cache> in the calling directory.
656
657=item B<--cache-size|-cs>
658
659Size of the cache. Defaults to C<50m>.
660
661=item B<--cache-init|-ci>
662
663Initialize cache file.
664Can be flagged using C<--no-cache-init> as well.
665Defaults to C<true>.
666
667=item B<--cache-delete|-cd>
668
669Delete cache file after processing.
670Can be flagged using C<--no-cache-delete> as well.
671Defaults to C<true>.
672
Akrone10ad322016-02-27 10:54:26 +0100673=item B<--sigle|-sg>
674
675Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100676Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100677I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100678
Akron941c1a62016-02-23 17:41:41 +0100679=item B<--log|-l>
680
681The L<Log4perl> log level, defaults to C<ERROR>.
682
683=item B<--help|-h>
684
685Print this document.
686
687=item B<--version|-v>
688
689Print version information.
690
691=back
692
Akronc13a1702016-03-15 19:33:14 +0100693=head1 ANNOTATION SUPPORT
694
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
699
Akronf7ad89e2016-03-16 18:22:47 +0100700=over 2
Akronc13a1702016-03-15 19:33:14 +0100701
702=item B<Base>
703
704=over 4
705
Akronf7ad89e2016-03-16 18:22:47 +0100706=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100707
Akronf7ad89e2016-03-16 18:22:47 +0100708=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100709
710=back
711
712=item B<Connexor>
713
714=over 4
715
Akronf7ad89e2016-03-16 18:22:47 +0100716=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100717
Akronf7ad89e2016-03-16 18:22:47 +0100718=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100719
Akronf7ad89e2016-03-16 18:22:47 +0100720=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100721
Akronf7ad89e2016-03-16 18:22:47 +0100722=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100723
724=back
725
726=item B<CoreNLP>
727
728=over 4
729
Akronf7ad89e2016-03-16 18:22:47 +0100730=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100731
Akronf7ad89e2016-03-16 18:22:47 +0100732=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100733
Akronf7ad89e2016-03-16 18:22:47 +0100734=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100735
Akronf7ad89e2016-03-16 18:22:47 +0100736=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100737
738=back
739
740=item B<DeReKo>
741
742=over 4
743
Akronf7ad89e2016-03-16 18:22:47 +0100744=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100745
746=back
747
748=item B<Glemm>
749
750=over 4
751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100753
754=back
755
756=item B<Mate>
757
758=over 4
759
Akronf7ad89e2016-03-16 18:22:47 +0100760=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100761
Akronf7ad89e2016-03-16 18:22:47 +0100762=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100763
764=back
765
766=item B<OpenNLP>
767
768=over 4
769
Akronf7ad89e2016-03-16 18:22:47 +0100770=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100771
Akronf7ad89e2016-03-16 18:22:47 +0100772=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100773
774=back
775
776=item B<Sgbr>
777
778=over 4
779
Akronf7ad89e2016-03-16 18:22:47 +0100780=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100783
784=back
785
786=item B<TreeTagger>
787
788=over 4
789
Akronf7ad89e2016-03-16 18:22:47 +0100790=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100791
Akronf7ad89e2016-03-16 18:22:47 +0100792=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100793
794=back
795
796=item B<XIP>
797
798=over 4
799
=item #Constituency

=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100801
Akronf7ad89e2016-03-16 18:22:47 +0100802=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100803
Akronf7ad89e2016-03-16 18:22:47 +0100804=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100805
806=back
807
808=back
809
810More importers are in preparation.
811New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
812See the built-in annotation importers as examples.
813
Akron941c1a62016-02-23 17:41:41 +0100814=head1 AVAILABILITY
815
816 https://github.com/KorAP/KorAP-XML-Krill
817
818
819=head1 COPYRIGHT AND LICENSE
820
821Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100822
Akron941c1a62016-02-23 17:41:41 +0100823Author: L<Nils Diewald|http://nils-diewald.de/>
824
825L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
826Corpus Analysis Platform at the
827L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
828member of the
829L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
830
831This program is free software published under the
832L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
833
834=cut