blob: f84439f68eb148d736987f013e69587533f36e71 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010019# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010020
Akron941c1a62016-02-23 17:41:41 +010021# CHANGES:
22# ----------------------------------------------------------
23# 2013/11/25
24# - Initial release
25#
26# 2014/10/29
27# - Merges foundry data to create indexer friendly documents
28#
Akron93d620e2016-02-05 19:40:05 +010029# 2016/02/04
30# - renamed to korapxml2krill
31# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010032#
33# 2016/02/12
34# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010035# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010036#
37# 2016/02/14
38# - Added version information
Akron941c1a62016-02-23 17:41:41 +010039# - Added support for archive files
40#
41# 2016/02/15
42# - Fixed temporary directory bug
43# - Improved skipping before unzipping
44# - Added EXPERIMENTAL concurrency support
45#
46# 2016/02/23
47# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010048#
49# 2016/02/27
50# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010051#
52# 2016/03/17
53# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010054#
55# 2016/03/18
56# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020057#
58# 2016/06/24
59# - Added multi archive support
60# - Added prefix negation support
Akron941c1a62016-02-23 17:41:41 +010061# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010062
# Date of the most recent entry in the CHANGES list above;
# it is shown as part of the version banner.
our $LAST_CHANGE = '2016/06/24';

# Directory containing this script. It is used to locate the script
# again when it calls itself for every text during archive processing.
our $LOCAL = $FindBin::Bin;

# Banner printed by --help, --version and on usage errors.
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
68
69
# Parse command:
# A leading argument that does not start with a dash is taken as
# the command (e.g. "archive" or "extract") and is removed from the
# argument list before the options are parsed.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +010076
# Options that can be given multiple times are collected in arrays.
# They are declared up front, because a reference to the whole array
# has to be passed to GetOptions: \(my @array) would create references
# to the (zero) elements of the array instead of an array reference
# and leave the option without a destination.
my (@skip, @sigle, @input, @anno);
my $text;

# Usage information for invalid invocations (exits with status 1)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line.
# Unknown or malformed options print the usage information
# instead of being silently ignored.
GetOptions(
  'input|i=s'   => \@input,
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000123
Nils Diewald7364d1f2013-11-05 19:26:35 +0000124
# Initialize log4perl object:
# The root logger is configured with the level given by --log
# and writes to an appender named STDERR.
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

# Logger used by the rest of the script
my $log = Log::Log4perl->get_logger('main');
134
Akron941c1a62016-02-23 17:41:41 +0100135
# Get file name based on path information.
#
# Takes the path of a text directory, removes the first occurrence of
# the first input path (optionally preceded by a slash), turns all
# remaining slashes into dashes and strips leading dashes.
# Returns the resulting base name (without any file extension).
sub get_file_name ($) {
  my $base = $input[0];
  my $file = shift;

  # The input path is user supplied and has to match literally -
  # interpolated unquoted, characters like '.', '+' or '(' would be
  # treated as regular expression syntax.
  # NOTE(review): the match is not anchored to the start of the path,
  # which keeps the previous matching position - confirm whether a
  # start anchor was intended.
  $file =~ s{/?\Q$base\E}{};
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};
  return $file;
};
145
Akron941c1a62016-02-23 17:41:41 +0100146
# Write file
#
# Converts a single text directory by calling this script again in
# single-file mode, forwarding the relevant command line options.
#
# Parameter: path of the text directory to convert.
# Returns:   the base name of the output file (without extension).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  # The command is assembled as a list and passed to system() in list
  # form. That way no shell is involved, so paths containing whitespace
  # or values like '#ALL' (a comment for the shell) arrive unchanged.
  my @call = ('perl', $LOCAL . '/korapxml2krill');
  push @call, '-i', $anno;
  push @call, '-o', $output . '/' . $file . '.json' . ($gzip ? '.gz' : '');
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Forward the primary switch only if it was given explicitly,
  # and forward it with the same polarity.
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # --pretty is a flag and takes no value
  push @call, '-y' if $pretty;

  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report a failing conversion instead of ignoring it silently
  my $status = system(@call);
  if ($status != 0) {
    $log->error("Unable to convert '$anno' (system returned $status)");
  };

  return "$file";
};
173
Nils Diewald2db9ad02013-10-29 19:26:43 +0000174
# Convert sigle to path construct:
# A sigle of the form CORPUS_DOC.TEXT is rewritten in place to
# CORPUS/DOC/TEXT, surrounding whitespace is removed.
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
177
# Process a single file:
# Parses the document in the first input directory, adds the base
# tokenization and all requested annotation layers and writes the
# result as JSON to the output file (optionally gzipped) or to STDOUT.
unless ($cmd) {
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip (compared case insensitively)
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Start the timers at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last call and since the start (trace level)
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
          ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
        );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document (the path needs a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  # Name used in messages - there is no output file when
  # the result is written to STDOUT
  my $target = $output // $input;

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization: <foundry>[#<layer>].
  # Parts that are not given keep their default.
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry if defined $foundry && length $foundry;
    $token_base_layer   = $layer   if defined $layer   && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # Supported annotations as [foundry, layer] pairs,
  # in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add explicitly requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  my $print_text = $pretty ?
    $tokens->to_pretty_json($primary) :
    $tokens->to_json($primary);

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1);
      unless ($file) {
        $log->error("Unable to open '$output' for writing: $GzipError");
        exit(1);
      };
    }
    else {
      # IO::File is not loaded at the top of the script
      require IO::File;
      $file = IO::File->new($output, "w");
      unless ($file) {
        $log->error("Unable to open '$output' for writing: $!");
        exit(1);
      };
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000336
# Extract XML files:
# Extracts the texts given by --sigle from the input archive
# into the output directory.
# Exits with 0 if all given sigles were extracted and with 1 otherwise.
elsif ($cmd eq 'extract') {

  my $input = $input[0];

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # The input has to be an existing archive file
  my $archive = -f($input) ? KorAP::XML::Archive->new($input) : undef;
  unless ($archive) {
    print "Input '$input' is not an archive.\n\n";
    exit(1);
  };

  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Iterate over all given sigles and extract
  my $failed = 0;
  foreach (@sigle) {
    my $extracted = $archive->extract('./'. $_, $output);
    $failed++ unless $extracted;
    print "$_ ";
    print '' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  print "\n";

  # Only signal an error if something could not be extracted
  exit($failed ? 1 : 0);
}
371
# Process an archive:
# Converts every text of a directory tree or of a Zip archive by
# calling write_file() for each text, optionally in parallel forks.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first;
      # the reference passed to finish() is the last parameter.
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # There is no message if the child did not pass any data
      my $report = ref($data) eq 'SCALAR' ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
          ($code ? " $code" : '') .
            " $report\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Initialize the cache file used by the conversion calls
  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    # NOTE(review): this also attaches the first input, which was
    # already passed to the constructor - confirm that this is intended.
    $archive->attach($_) foreach @input;

    # File::Temp is not loaded at the top of the script
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before the fork finishes
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing was processed - there is no timing to report
  else {
    print "Input is neither a directory nor an archive.\n\n";
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
535
# Unknown command:
# Report the unsupported command on STDERR and print the usage
# information (which ends the program).
else {
  warn sprintf("Unknown command '%s'.\n\n", $cmd);
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000541
542__END__
Akron941c1a62016-02-23 17:41:41 +0100543
544=pod
545
546=encoding utf8
547
548=head1 NAME
549
Akronf7ad89e2016-03-16 18:22:47 +0100550korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100551
552
553=head1 SYNOPSIS
554
  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100558
559
560=head1 DESCRIPTION
561
562L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
563compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100564The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100565
566
567=head1 INSTALLATION
568
569The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
570
571 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
572
Akronc13a1702016-03-15 19:33:14 +0100573In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100574be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100575
576
577=head1 ARGUMENTS
578
579=over 2
580
581=item B<archive>
582
Akrone10ad322016-02-27 10:54:26 +0100583Process an archive as a Zip-file or a folder of KorAP-XML documents.
584
585=item B<extract>
586
587Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100588
589=back
590
591
592=head1 OPTIONS
593
594=over 2
595
Akron2cfe8092016-06-24 17:48:49 +0200596=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100597
Akronf7ad89e2016-03-16 18:22:47 +0100598Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100599
Multiple input archives are supported by the C<archive> command,
provided that the first archive listed contains all primary data files
and all metadata files.
604The directory structure follows the base directory format,
605starting with a C<.> root folder.
606In case an attached archive has no C<.> root folder,
607the archive path should start with a hash.
608
609 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
610
611
Akron941c1a62016-02-23 17:41:41 +0100612=item B<--output|-o> <directory|file>
613
614Output folder for archive processing or
615document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100616writes to C<STDOUT> by default
617(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100618
619=item B<--overwrite|-w>
620
621Overwrite files that already exist.
622
623=item B<--token|-t> <foundry>[#<file>]
624
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100628
629=item B<--skip|-s> <foundry>[#<layer>]
630
Akronf7ad89e2016-03-16 18:22:47 +0100631Skip specific annotations by specifying the foundry
632(and optionally the layer with a C<#>-prefix),
633e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100634Can be set multiple times.
635
Akronc13a1702016-03-15 19:33:14 +0100636=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100637
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Only takes effect in combination with C<--skip #ALL>;
otherwise all supported annotations that are not skipped are converted.
Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100642
643=item B<--primary|-p>
644
Akronc13a1702016-03-15 19:33:14 +0100645Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100646Can be flagged using C<--no-primary> as well.
647This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100648
649=item B<--jobs|-j>
650
Define the number of concurrent jobs in separate forks
for archive processing.
Akron11c80302016-03-18 19:44:43 +0100653Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100654This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100655
Akron35db6e32016-03-17 22:42:22 +0100656=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100657
Akron35db6e32016-03-17 22:42:22 +0100658Define the metadata parser to use. Defaults to C<I5>.
659Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
660This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100661
662=item B<--pretty|-y>
663
Akronc13a1702016-03-15 19:33:14 +0100664Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100665This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100666
667=item B<--gzip|-z>
668
Akronf7ad89e2016-03-16 18:22:47 +0100669Compress the output.
670Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100671
Akron11c80302016-03-18 19:44:43 +0100672=item B<--cache|-c>
673
674File to mmap a cache (using L<Cache::FastMmap>).
675Defaults to C<korapxml2krill.cache> in the calling directory.
676
677=item B<--cache-size|-cs>
678
679Size of the cache. Defaults to C<50m>.
680
681=item B<--cache-init|-ci>
682
683Initialize cache file.
684Can be flagged using C<--no-cache-init> as well.
685Defaults to C<true>.
686
687=item B<--cache-delete|-cd>
688
689Delete cache file after processing.
690Can be flagged using C<--no-cache-delete> as well.
691Defaults to C<true>.
692
Akrone10ad322016-02-27 10:54:26 +0100693=item B<--sigle|-sg>
694
695Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100696Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100697I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100698
Akron941c1a62016-02-23 17:41:41 +0100699=item B<--log|-l>
700
701The L<Log4perl> log level, defaults to C<ERROR>.
702
703=item B<--help|-h>
704
705Print this document.
706
707=item B<--version|-v>
708
709Print version information.
710
711=back
712
Akronc13a1702016-03-15 19:33:14 +0100713=head1 ANNOTATION SUPPORT
714
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
719
Akronf7ad89e2016-03-16 18:22:47 +0100720=over 2
Akronc13a1702016-03-15 19:33:14 +0100721
722=item B<Base>
723
724=over 4
725
Akronf7ad89e2016-03-16 18:22:47 +0100726=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100727
Akronf7ad89e2016-03-16 18:22:47 +0100728=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100729
730=back
731
732=item B<Connexor>
733
734=over 4
735
Akronf7ad89e2016-03-16 18:22:47 +0100736=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100737
Akronf7ad89e2016-03-16 18:22:47 +0100738=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100739
Akronf7ad89e2016-03-16 18:22:47 +0100740=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100741
Akronf7ad89e2016-03-16 18:22:47 +0100742=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100743
744=back
745
746=item B<CoreNLP>
747
748=over 4
749
Akronf7ad89e2016-03-16 18:22:47 +0100750=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100753
Akronf7ad89e2016-03-16 18:22:47 +0100754=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100755
Akronf7ad89e2016-03-16 18:22:47 +0100756=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100757
758=back
759
760=item B<DeReKo>
761
762=over 4
763
Akronf7ad89e2016-03-16 18:22:47 +0100764=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100765
766=back
767
768=item B<Glemm>
769
770=over 4
771
Akronf7ad89e2016-03-16 18:22:47 +0100772=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100773
774=back
775
776=item B<Mate>
777
778=over 4
779
Akronf7ad89e2016-03-16 18:22:47 +0100780=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100783
784=back
785
786=item B<OpenNLP>
787
788=over 4
789
Akronf7ad89e2016-03-16 18:22:47 +0100790=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100791
Akronf7ad89e2016-03-16 18:22:47 +0100792=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100793
794=back
795
796=item B<Sgbr>
797
798=over 4
799
Akronf7ad89e2016-03-16 18:22:47 +0100800=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100801
Akronf7ad89e2016-03-16 18:22:47 +0100802=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100803
804=back
805
806=item B<TreeTagger>
807
808=over 4
809
Akronf7ad89e2016-03-16 18:22:47 +0100810=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100811
Akronf7ad89e2016-03-16 18:22:47 +0100812=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100813
814=back
815
816=item B<XIP>
817
818=over 4
819
Akronf7ad89e2016-03-16 18:22:47 +0100820=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100821
Akronf7ad89e2016-03-16 18:22:47 +0100822=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100823
Akronf7ad89e2016-03-16 18:22:47 +0100824=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100825
826=back
827
828=back
829
830More importers are in preparation.
831New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
832See the built-in annotation importers as examples.
833
Akron941c1a62016-02-23 17:41:41 +0100834=head1 AVAILABILITY
835
836 https://github.com/KorAP/KorAP-XML-Krill
837
838
839=head1 COPYRIGHT AND LICENSE
840
841Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100842
Akron941c1a62016-02-23 17:41:41 +0100843Author: L<Nils Diewald|http://nils-diewald.de/>
844
845L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
846Corpus Analysis Platform at the
847L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
848member of the
849L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
850
851This program is free software published under the
852L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
853
854=cut