#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use Parallel::ForkManager;
# TODO: use Parallel::Loops

# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
# ----------------------------------------------------------

# Date of the most recent entry in the CHANGES list
our $LAST_CHANGE = '2016/03/18';

# Directory of this script - used to call the script recursively
our $LOCAL = $FindBin::Bin;

# Version line shown in the help and version output
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION


# Parse command:
# A first argument that does not start with a dash is treated
# as the command and removed from the argument list
my $cmd;
our @ARGV;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Options that can be set multiple times are collected in arrays.
# The arrays are declared in advance, because \(my @array) does not
# create an array reference but a list of references to the elements
# (which is empty for a new array) - so the option would never
# be bound to the array.
my (@skip, @sigle, @anno);
my $text;

# Parameters for the usage message on invalid calls
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line
# and show the usage message in case of invalid options
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless $input;


# Initialize log4perl object:
# Everything at or above the requested level is written
# colored to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


# Get file name based on path information:
# The first occurrence of the input path (with an optional leading
# slash) is removed from the given path, all remaining slashes are
# turned into dashes and leading dashes are dropped.
# Returns the resulting name (without any file suffix).
sub get_file_name ($) {
  my $file = shift;

  # The input path is quoted, so regex metacharacters in it
  # (like '.', '+' or '(') are matched literally. The match is
  # deliberately not anchored - the former '^?' made the anchor
  # optional as well.
  $file =~ s/\/?\Q$input\E//;
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};


# Write file:
# Converts a single text directory by calling this script again
# in single-file mode with the options of the current call.
# Expects the path of the text directory and returns the name
# of the text as created by get_file_name().
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  # Path of the document to create
  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is built as a list, so it is executed without a shell
  # and paths with spaces or shell metacharacters are passed unchanged
  my @call = ('perl', $LOCAL . '/korapxml2krill', '-i', $anno, '-o', $target);
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # NOTE(review): the negated flag is passed in case $primary is set -
  # this is kept as it was, please confirm that it is intended
  push @call, '--no-primary' if $primary;

  # The pretty option is a flag and takes no value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error("Conversion of $anno failed (status $?)");
  };

  return "$file";
};


# Convert sigle to path construct:
# A sigle of the form A_B.C (with optional surrounding whitespace)
# is rewritten in place to A/B/C - other values stay untouched
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Process a single file
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip (case insensitive)
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Take the start times as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last call
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Name of the document in log messages - the output may be
  # undefined in case the document is written to STDOUT
  my $name = $output // $input;

  # Create and parse new document
  # (the path is expected to end with a slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($name . " can't be processed - no document data");
    exit(0);
  };

  # The base tokenization is given as <foundry>#<layer>
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($name . " can't be processed - no base tokenization");
    exit(0);
  };

  # Supported annotations as [foundry, layer] pairs
  # in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      # Skip if Foundry or Foundry#Layer should be skipped
      unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
        $tokens->add(@$info);
        stop_time();
      };
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1);
    }
    else {
      $file = IO::File->new($output, "w");
    };

    # Both constructors return undef on failure - report the
    # reason instead of calling a method on an undefined value
    unless ($file) {
      $log->error("Unable to open $output: " . ($gzip ? $GzipError : $!));
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}

# Extract XML files:
# Extracts the texts given by --sigle from the input archive
# to the output directory
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Number of texts that could not be extracted
    my $failed = 0;

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ";
      if ($archive->extract('./'. $_, $output)) {
        print '';
      }
      else {
        print 'not ';
        $failed++;
      };
      print "extracted.\n";
    };

    print "\n";

    # Only signal an error in case an extraction failed
    exit($failed ? 1 : 0);
  };

  # The input is no file or can't be opened as an archive
  print "Input is not an archive.\n\n";
  exit(1);
}

# Process an archive:
# Converts all texts of a directory or a zip archive to the
# output directory, optionally in multiple forked processes
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the process id and the exit code
      # first and the data reference passed to finish() last
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # The data may be missing, e.g. if the child died
      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  # Start time of the processing
  my $t;
  print "Reading data ...\n";

  # Initialize the cache shared with the converting processes
  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Skip texts that are already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;
      my $msg;

      $msg = write_file($dirs[$i]);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      # Skip texts that are already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($dirs[$i], $temp)) {

        # Create corpus directory
        # (get_file_name() strips this path from the text path)
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $dirs[$i] . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # The clock is only started in case the input could be processed
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command:
# Warn and show the usage message
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}

__END__

=pod

=encoding utf8

=head1 NAME

korapxml2krill - Merge KorAP-XML data and create Krill documents


=head1 SYNOPSIS

  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>


=head1 DESCRIPTION

L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
The C<korapxml2krill> command line tool is a simple wrapper to the library.


=head1 INSTALLATION

The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.

  $ cpanm https://github.com/KorAP/KorAP-XML-Krill

In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.


=head1 ARGUMENTS

=over 2

=item B<archive>

Process an archive as a Zip-file or a folder of KorAP-XML documents.

=item B<extract>

Extract KorAP-XML files from a Zip-file.

=back


=head1 OPTIONS

=over 2

=item B<--input|-i> <directory|file>

Directory or archive file of documents to convert.

=item B<--output|-o> <directory|file>

Output folder for archive processing or
document name for single output (optional),
writes to C<STDOUT> by default
(in case C<output> is not mandatory due to further options).

=item B<--overwrite|-w>

Overwrite files that already exist.

=item B<--token|-t> <foundry>[#<file>]

Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.

=item B<--skip|-s> <foundry>[#<layer>]

Skip specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Can be set multiple times.

=item B<--anno|-a> <foundry>#<layer>

Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.

=item B<--primary|-p>

Output primary data or not. Defaults to C<true>.
Can be flagged using C<--no-primary> as well.
This is I<deprecated>.

=item B<--jobs|-j>

Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
This is I<experimental>.

=item B<--meta|-m>

Define the metadata parser to use. Defaults to C<I5>.
Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
This is I<experimental>.

=item B<--pretty|-y>

Pretty print JSON output. Defaults to C<false>.
This is I<deprecated>.

=item B<--gzip|-z>

Compress the output.
Expects a defined C<output> file in single processing.

=item B<--cache|-c>

File to mmap a cache (using L<Cache::FastMmap>).
Defaults to C<korapxml2krill.cache> in the calling directory.

=item B<--cache-size|-cs>

Size of the cache. Defaults to C<50m>.

=item B<--cache-init|-ci>

Initialize cache file.
Can be flagged using C<--no-cache-init> as well.
Defaults to C<true>.

=item B<--cache-delete|-cd>

Delete cache file after processing.
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.

=item B<--sigle|-sg>

Extract the given text sigles.
Can be set multiple times.
I<Currently only supported on C<extract>.>

=item B<--log|-l>

The L<Log4perl> log level, defaults to C<ERROR>.

=item B<--help|-h>

Print this document.

=item B<--version|-v>

Print version information.

=back

=head1 ANNOTATION SUPPORT

L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element are mandatory for
L<Krill|https://github.com/KorAP/Krill>.

=over 2

=item B<Base>

=over 4

=item #Paragraphs

=item #Sentences

=back

=item B<Connexor>

=over 4

=item #Morpho

=item #Phrase

=item #Sentences

=item #Syntax

=back

=item B<CoreNLP>

=over 4

=item #Constituency

=item #Morpho

=item #NamedEntities

=item #Sentences

=back

=item B<DeReKo>

=over 4

=item #Structure

=back

=item B<Glemm>

=over 4

=item #Morpho

=back

=item B<Mate>

=over 4

=item #Dependency

=item #Morpho

=back

=item B<OpenNLP>

=over 4

=item #Morpho

=item #Sentences

=back

=item B<Sgbr>

=over 4

=item #Lemma

=item #Morpho

=back

=item B<TreeTagger>

=over 4

=item #Morpho

=item #Sentences

=back

=item B<XIP>

=over 4

=item #Constituency

=item #Dependency

=item #Morpho

=item #Sentences

=back

=back

More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
See the built-in annotation importers as examples.

=head1 AVAILABILITY

  https://github.com/KorAP/KorAP-XML-Krill


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>

Author: L<Nils Diewald|http://nils-diewald.de/>

L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.

This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

=cut