blob: f8407959b49553ec54bb7d22af245fde451dfc31 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010019# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010020
Akron941c1a62016-02-23 17:41:41 +010021# CHANGES:
22# ----------------------------------------------------------
23# 2013/11/25
24# - Initial release
25#
26# 2014/10/29
27# - Merges foundry data to create indexer friendly documents
28#
Akron93d620e2016-02-05 19:40:05 +010029# 2016/02/04
30# - renamed to korapxml2krill
31# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010032#
33# 2016/02/12
34# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010035# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010036#
37# 2016/02/14
38# - Added version information
Akron941c1a62016-02-23 17:41:41 +010039# - Added support for archive files
40#
41# 2016/02/15
42# - Fixed temporary directory bug
43# - Improved skipping before unzipping
44# - Added EXPERIMENTAL concurrency support
45#
46# 2016/02/23
47# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010048#
49# 2016/02/27
50# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010051#
52# 2016/03/17
53# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010054#
55# 2016/03/18
56# - Added meta data caching
Akron941c1a62016-02-23 17:41:41 +010057# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010058
# Date of the most recent entry in the CHANGES list above.
# It is printed as part of the version banner.
our $LAST_CHANGE = '2016/03/18';

# Directory containing this script. It is used to call the
# script again for every text during archive processing.
our $LOCAL = $FindBin::Bin;

# Banner printed by --help and --version and on usage errors
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION


# Parse command:
# A leading argument that does not start with a dash is not an
# option and is therefore taken as the name of the command.
my $cmd;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};
Akron93d620e2016-02-05 19:40:05 +010072
# Options that can be given multiple times.
# '@anno' is declared here rather than inline in the option table:
# '\(my @anno)' evaluates to a list of references to the (zero)
# elements of the array instead of a reference to the array itself,
# so the values of --anno would never be stored in the array.
my (@skip, @sigle, @input, @anno);

# Arguments for pod2usage() on invalid invocations:
# Print the usage sections and exit with status 1.
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line.
# GetOptions() returns false on unknown options or invalid values;
# show the usage in that case instead of continuing with an
# incompletely parsed configuration.
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),
  'help|h'           => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'        => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000119
Nils Diewald7364d1f2013-11-05 19:26:35 +0000120
# Set up Log::Log4perl:
# Everything is logged to STDERR (with colored levels),
# starting at the level given by --log
my %log_conf = (
  'log4perl.rootLogger'                               => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'                          => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'                   => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger object used throughout the script
my $log = Log::Log4perl->get_logger('main');
130
Akron941c1a62016-02-23 17:41:41 +0100131
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it:
# The first occurrence of the first input path (optionally
# preceded by a slash) is removed, all remaining slashes are
# turned into dashes and leading dashes are stripped.
sub get_file_name ($) {
  my $file = shift;
  my $i = $input[0];

  # The input path is a literal string and not a pattern,
  # so it is quoted - otherwise characters like '.', '+' or '('
  # in the path would be treated as regular expression syntax.
  $file =~ s/\/?\Q$i\E//;
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
141
Akron941c1a62016-02-23 17:41:41 +0100142
# Write file.
#
# Converts a single text by calling this script again in a separate
# process (in single file mode), passing on the relevant options.
# Expects the path to the directory of the text.
# Returns the name (without extension) of the file that is written
# to the output directory.
sub write_file {
  my $anno = shift;
  my $file = get_file_name $anno;

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = catfile($output, $file . '.json' . ($gzip ? '.gz' : ''));

  # The call is assembled as a list and passed to system() in list
  # form, so paths and option values are never interpreted by a shell
  # (e.g. in case they contain whitespace). $^X is the perl binary
  # that runs the current process.
  my @call = ($^X, catfile($LOCAL, 'korapxml2krill'));
  push @call, '-i', $anno;
  push @call, '-o', $target;
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Pass on the primary switch only if it was given explicitly
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # --pretty is a flag and does not take a value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error("Unable to convert '$anno' (status: $?)");
  };

  return "$file";
};
169
Nils Diewald2db9ad02013-10-29 19:26:43 +0000170
# Convert sigle to path construct:
# "<corpus>_<doc>.<text>" becomes "<corpus>/<doc>/<text>"
# (surrounding whitespace is removed)
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
173
Akron941c1a62016-02-23 17:41:41 +0100174# Process a single file
175unless ($cmd) {
# Only the first input is converted in single file mode
my $input = $input[0];

# Can't print gzip to STDOUT
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Lookup table of the (lowercased) foundries and layers to skip
my %skip = map { (lc($_) => 1) } @skip;

# Ignore processing
unless ($overwrite) {
  if ($output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };
};
Akron941c1a62016-02-23 17:41:41 +0100189
# Remember the start time when the script is compiled
BEGIN {
  $main::TIME      = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};

# Log (on trace level) the time that has passed since the
# previous call and since the start, then reset the lap timer
sub stop_time {
  my $now     = Benchmark->new;
  my $lap     = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));
  $log->trace('The code took: ' . $lap . ' (overall: ' . $overall . ')');
  $main::LAST_STOP = $now;
};
204
# Create and parse new document
# (the path is expected to end with a slash)
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new(
  path => $input,
  meta_type => ($meta // 'I5'),
  cache => Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )
);

# Name used in log messages - there is no output file
# when the result is written to STDOUT
my $target_name = $output // $input;

unless ($doc->parse) {
  $log->warn($target_name . " can't be processed - no document data");
  exit(0);
};

# The layer of the base tokenization is optional in --token,
# so only the parts that are given override the defaults.
# NOTE(review): assumes 'Tokens' is the intended layer for a
# foundry given without a layer - confirm against KorAP::XML::Tokenizer.
my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
if ($token_base) {
  my ($foundry, $layer) = split /#/, $token_base;
  $token_base_foundry = $foundry if defined $foundry && length $foundry;
  $token_base_layer   = $layer   if defined $layer   && length $layer;
};

# Get tokenization
my $tokens = KorAP::XML::Tokenizer->new(
  path => $doc->path,
  doc => $doc,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  name => 'tokens'
);

# Unable to process base tokenization
unless ($tokens->parse) {
  $log->error($target_name . " can't be processed - no base tokenization");
  exit(0);
};
241
# All annotation layers that are added by default,
# given as [foundry, layer] pairs in processing order
my @layers = (
  # Base
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  # ['Malt', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],
);
288
289
# All default layers are skipped:
# Only add the annotations requested with --anno
if ($skip{'#all'}) {
  for my $anno (@anno) {
    my @foundry_layer = split('#', $anno);
    $tokens->add(@foundry_layer);
    stop_time();
  };
}

# Add to index file - respect skipping
else {
  LAYER:
  for my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next LAYER if $skip{$foundry};
    next LAYER if $skip{$foundry . '#' . $layer};

    $tokens->add(@$info);
    stop_time();
  };
};
306
# Serialize the document
my $print_text = $pretty
  ? $tokens->to_pretty_json($primary)
  : $tokens->to_json($primary);

# Write to the given file ...
if ($output) {
  my ($file, $reason);

  if ($gzip) {
    $file = IO::Compress::Gzip->new($output, Minimal => 1);
    $reason = $GzipError unless $file;
  }
  else {
    # Load explicitly instead of relying on another module to do so
    require IO::File;
    $file = IO::File->new($output, 'w');
    $reason = "$!" unless $file;
  };

  # Fail with a meaningful message instead of calling
  # a method on an undefined handle
  unless ($file) {
    $log->error("Unable to write to '$output': $reason");
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  # Buffered write errors may only surface on close
  unless ($file->print($print_text) && $file->close) {
    $log->error("Unable to write to '$output': $!");
    unlink($cache_file) if $cache_delete;
    exit(1);
  };
}

# ... or to STDOUT
else {
  print $print_text . "\n";
};

# Delete cache file
unlink($cache_file) if $cache_delete;

stop_time();
Nils Diewald7364d1f2013-11-05 19:26:35 +0000331}
Nils Diewald59094f22014-11-05 18:20:50 +0000332
Akrone10ad322016-02-27 10:54:26 +0100333# Extract XML files
334elsif ($cmd eq 'extract') {
335
# The archive to extract from
my $input = $input[0];

pod2usage(%ERROR_HASH) unless $output;

# TODO: Support sigles and full archives

# The target has to be an existing directory
unless (-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};

# The input has to be an archive file - report this
# instead of silently doing nothing
my $archive = -f($input) ? KorAP::XML::Archive->new($input) : undef;
unless ($archive) {
  print "Input is not an archive.\n\n";
  exit(1);
};

unless ($archive->test_unzip) {
  print "Unzip is not installed or incompatible.\n\n";
  exit(1);
};

# Iterate over all given sigles and extract
my $failed = 0;
foreach my $sigle (@sigle) {
  my $extracted = $archive->extract('./' . $sigle, $output);
  $failed++ unless $extracted;
  print "$sigle " . ($extracted ? '' : 'not ') . "extracted.\n";
};

print "\n";

# Succeed only if every requested sigle was extracted
exit($failed ? 1 : 0);
366}
367
Akron941c1a62016-02-23 17:41:41 +0100368# Process an archive
369elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000370
# TODO: Support sigles

pod2usage(%ERROR_HASH) unless $output;

# The target has to be an existing directory
unless (-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};

# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);

my $count = 0; # Texts to process
my $iter  = 1; # Current text in process

# Report on fork message
$pool->run_on_finish (
  sub {
    # The pid and the exit code are the first arguments of the
    # callback, the data passed to finish() is the last one
    my ($pid, $code) = @_;
    my $data = pop;

    # There is no data in case the child did not pass any
    my $msg = ref $data ? $$data : '';

    print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
      ($iter++) . "/$count]" .
      ($code ? " $code" : '') .
      " $msg\n";
  }
);

# Start time of the processing (set once the texts are collected)
my $t;
print "Reading data ...\n";

# Initialize the cache file shared by all conversion processes
unless (Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
)) {
  print "Unable to intialize cache '$cache_file'\n\n";
  exit(1);
};
409
# Input is a directory
if (-d $input[0]) {
  my $it = Directory::Iterator->new($input[0]);
  my @dirs;
  my $dir;

  # Collect all directories that contain a data.xml file
  while (1) {
    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
      push @dirs, $dir;
      $it->prune;
    };
    last unless $it->next;
  };

  print "Start processing ...\n";
  $t = Benchmark->new;
  $count = scalar @dirs;

  DIRECTORY_LOOP:
  foreach my $text_dir (@dirs) {

    # Skip texts that were already converted
    unless ($overwrite) {
      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      if (-e $filename) {
        $iter++;
        print "Skip $filename\n";
        next DIRECTORY_LOOP;
      };
    };

    # Get the next fork
    my $pid = $pool->start and next DIRECTORY_LOOP;

    my $msg = write_file($text_dir);
    $pool->finish(0, \$msg);
  };
}

# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives.
  # The archive object was created for the first input,
  # so only the remaining inputs need to be attached.
  $archive->attach($_) foreach @input[1 .. $#input];

  # Load explicitly instead of relying on another module to do so
  require File::Temp;

  print "Start processing ...\n";
  $t = Benchmark->new;
  my @dirs = $archive->list_texts;
  $count = scalar @dirs;

  ARCHIVE_LOOP:
  foreach my $path (@dirs) {

    # Split path information
    my ($prefix, $corpus, $doc, $text) = $archive->split_path($path);

    # Skip texts that were already converted
    unless ($overwrite) {
      my $filename = catfile(
        $output,
        get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
      );

      if (-e $filename) {
        $iter++;
        print "Skip $filename\n";
        next ARCHIVE_LOOP;
      };
    };

    # Get the next fork
    my $pid = $pool->start and next ARCHIVE_LOOP;

    # Create temporary directory
    my $temp = File::Temp->newdir;

    my $msg;

    # Extract from archive
    if ($archive->extract($path, $temp)) {

      # Create corpus directory
      my $input = catdir("$temp", $corpus);

      # Temporary directory
      my $dir = catdir($input, $doc, $text);

      # Write file
      $msg = write_file($dir);

      # Release the temporary directory before finishing
      $temp = undef;
      $pool->finish(0, \$msg);
    }
    else {

      $temp = undef;
      $msg = "Unable to extract " . $path . "\n";
      $pool->finish(1, \$msg);
    };
  };
}

# Nothing can be processed: stop here, as there is
# no start time to report on
else {
  print "Input is neither a directory nor an archive.\n\n";
  unlink($cache_file) if $cache_delete;
  exit(1);
};

$pool->wait_all_children;

# Delete cache file
unlink($cache_file) if $cache_delete;

print "Done.\n";
print timestr(timediff(Benchmark->new, $t))."\n\n";
530}
531
532# Unknown command
533else {
534 warn "Unknown command '$cmd'.\n\n";
535 pod2usage(%ERROR_HASH);
536}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000537
538__END__
Akron941c1a62016-02-23 17:41:41 +0100539
540=pod
541
542=encoding utf8
543
544=head1 NAME
545
Akronf7ad89e2016-03-16 18:22:47 +0100546korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100547
548
549=head1 SYNOPSIS
550
Akronc13a1702016-03-15 19:33:14 +0100551 $ korapxml2krill -z --input <directory> --output <filename>
552 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100554
555
556=head1 DESCRIPTION
557
558L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
559compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100560The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100561
562
563=head1 INSTALLATION
564
565The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
566
567 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
568
Akronc13a1702016-03-15 19:33:14 +0100569In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100570be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100571
572
573=head1 ARGUMENTS
574
575=over 2
576
577=item B<archive>
578
Akrone10ad322016-02-27 10:54:26 +0100579Process an archive as a Zip-file or a folder of KorAP-XML documents.
580
581=item B<extract>
582
583Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100584
585=back
586
587
588=head1 OPTIONS
589
590=over 2
591
592=item B<--input|-i> <directory|file>
593
Akronf7ad89e2016-03-16 18:22:47 +0100594Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100595
596=item B<--output|-o> <directory|file>
597
598Output folder for archive processing or
599document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100600writes to C<STDOUT> by default
601(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100602
603=item B<--overwrite|-w>
604
605Overwrite files that already exist.
606
607=item B<--token|-t> <foundry>[#<file>]
608
609Define the default tokenization by specifying
610the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100612
613=item B<--skip|-s> <foundry>[#<layer>]
614
Akronf7ad89e2016-03-16 18:22:47 +0100615Skip specific annotations by specifying the foundry
616(and optionally the layer with a C<#>-prefix),
617e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100618Can be set multiple times.
619
Akronc13a1702016-03-15 19:33:14 +0100620=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100621
Akronf7ad89e2016-03-16 18:22:47 +0100622Convert specific annotations by specifying the foundry
623(and optionally the layer with a C<#>-prefix),
624e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
Only takes effect if all default annotations are skipped
using C<--skip '#ALL'>.
Akron941c1a62016-02-23 17:41:41 +0100626
627=item B<--primary|-p>
628
Akronc13a1702016-03-15 19:33:14 +0100629Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100630Can be flagged using C<--no-primary> as well.
631This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100632
633=item B<--jobs|-j>
634
Define the number of concurrent jobs in separated forks
Akronf7ad89e2016-03-16 18:22:47 +0100636for archive processing.
Akron11c80302016-03-18 19:44:43 +0100637Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100638This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100639
Akron35db6e32016-03-17 22:42:22 +0100640=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100641
Akron35db6e32016-03-17 22:42:22 +0100642Define the metadata parser to use. Defaults to C<I5>.
643Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
644This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100645
646=item B<--pretty|-y>
647
Akronc13a1702016-03-15 19:33:14 +0100648Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100649This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100650
651=item B<--gzip|-z>
652
Akronf7ad89e2016-03-16 18:22:47 +0100653Compress the output.
654Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100655
Akron11c80302016-03-18 19:44:43 +0100656=item B<--cache|-c>
657
658File to mmap a cache (using L<Cache::FastMmap>).
659Defaults to C<korapxml2krill.cache> in the calling directory.
660
661=item B<--cache-size|-cs>
662
663Size of the cache. Defaults to C<50m>.
664
665=item B<--cache-init|-ci>
666
667Initialize cache file.
668Can be flagged using C<--no-cache-init> as well.
669Defaults to C<true>.
670
671=item B<--cache-delete|-cd>
672
673Delete cache file after processing.
674Can be flagged using C<--no-cache-delete> as well.
675Defaults to C<true>.
676
Akrone10ad322016-02-27 10:54:26 +0100677=item B<--sigle|-sg>
678
679Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100680Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100681I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100682
Akron941c1a62016-02-23 17:41:41 +0100683=item B<--log|-l>
684
The L<Log::Log4perl> log level, defaults to C<ERROR>.
686
687=item B<--help|-h>
688
689Print this document.
690
691=item B<--version|-v>
692
693Print version information.
694
695=back
696
Akronc13a1702016-03-15 19:33:14 +0100697=head1 ANNOTATION SUPPORT
698
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
703
Akronf7ad89e2016-03-16 18:22:47 +0100704=over 2
Akronc13a1702016-03-15 19:33:14 +0100705
706=item B<Base>
707
708=over 4
709
Akronf7ad89e2016-03-16 18:22:47 +0100710=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100711
Akronf7ad89e2016-03-16 18:22:47 +0100712=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100713
714=back
715
716=item B<Connexor>
717
718=over 4
719
Akronf7ad89e2016-03-16 18:22:47 +0100720=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100721
Akronf7ad89e2016-03-16 18:22:47 +0100722=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100723
Akronf7ad89e2016-03-16 18:22:47 +0100724=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100725
Akronf7ad89e2016-03-16 18:22:47 +0100726=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100727
728=back
729
730=item B<CoreNLP>
731
732=over 4
733
Akronf7ad89e2016-03-16 18:22:47 +0100734=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100735
Akronf7ad89e2016-03-16 18:22:47 +0100736=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100737
Akronf7ad89e2016-03-16 18:22:47 +0100738=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100739
Akronf7ad89e2016-03-16 18:22:47 +0100740=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100741
742=back
743
744=item B<DeReKo>
745
746=over 4
747
Akronf7ad89e2016-03-16 18:22:47 +0100748=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100749
750=back
751
752=item B<Glemm>
753
754=over 4
755
Akronf7ad89e2016-03-16 18:22:47 +0100756=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100757
758=back
759
760=item B<Mate>
761
762=over 4
763
Akronf7ad89e2016-03-16 18:22:47 +0100764=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100765
Akronf7ad89e2016-03-16 18:22:47 +0100766=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100767
768=back
769
770=item B<OpenNLP>
771
772=over 4
773
Akronf7ad89e2016-03-16 18:22:47 +0100774=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100775
Akronf7ad89e2016-03-16 18:22:47 +0100776=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100777
778=back
779
780=item B<Sgbr>
781
782=over 4
783
Akronf7ad89e2016-03-16 18:22:47 +0100784=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100785
Akronf7ad89e2016-03-16 18:22:47 +0100786=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100787
788=back
789
790=item B<TreeTagger>
791
792=over 4
793
Akronf7ad89e2016-03-16 18:22:47 +0100794=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100795
Akronf7ad89e2016-03-16 18:22:47 +0100796=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100797
798=back
799
800=item B<XIP>
801
802=over 4
803
Akronf7ad89e2016-03-16 18:22:47 +0100804=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100805
Akronf7ad89e2016-03-16 18:22:47 +0100806=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100807
Akronf7ad89e2016-03-16 18:22:47 +0100808=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100809
810=back
811
812=back
813
814More importers are in preparation.
815New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
816See the built-in annotation importers as examples.
817
Akron941c1a62016-02-23 17:41:41 +0100818=head1 AVAILABILITY
819
820 https://github.com/KorAP/KorAP-XML-Krill
821
822
823=head1 COPYRIGHT AND LICENSE
824
825Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100826
Akron941c1a62016-02-23 17:41:41 +0100827Author: L<Nils Diewald|http://nils-diewald.de/>
828
829L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
830Corpus Analysis Platform at the
831L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
832member of the
833L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
834
835This program is free software published under the
836L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
837
838=cut