blob: 0efdd3a36342ffd405a37945ea750d500a36af3d [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010019# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010020
Akron941c1a62016-02-23 17:41:41 +010021# CHANGES:
22# ----------------------------------------------------------
23# 2013/11/25
24# - Initial release
25#
26# 2014/10/29
27# - Merges foundry data to create indexer friendly documents
28#
Akron93d620e2016-02-05 19:40:05 +010029# 2016/02/04
30# - renamed to korapxml2krill
31# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010032#
33# 2016/02/12
34# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010035# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010036#
37# 2016/02/14
38# - Added version information
Akron941c1a62016-02-23 17:41:41 +010039# - Added support for archive files
40#
41# 2016/02/15
42# - Fixed temporary directory bug
43# - Improved skipping before unzipping
44# - Added EXPERIMENTAL concurrency support
45#
46# 2016/02/23
47# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010048#
49# 2016/02/27
50# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010051#
52# 2016/03/17
53# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010054#
55# 2016/03/18
56# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020057#
Akronf3f0c942016-06-27 13:27:14 +020058# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020059# - Added multi archive support
60# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020061# - Added Malt#Dependency support
Akron941c1a62016-02-23 17:41:41 +010062# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010063
# Date of the most recent entry in the CHANGES list above
our $LAST_CHANGE = '2016/06/27';

# Directory this script lives in (used to call the script again per text)
our $LOCAL = $FindBin::Bin;

# Banner printed together with usage and version information
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
69
70
# Parse command: a first argument that does not start with a dash
# selects the processing mode; without it a single text is converted
my $cmd;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Options that can be given multiple times.
# @anno is declared here and passed as \@anno below, because
# \(my @anno) evaluates to a list of references to the (zero) elements
# of the array, which leaves the option without a destination.
my (@skip, @sigle, @input, @anno);
my $text;

# Arguments for pod2usage in case of an invalid call
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -exit => 1
);

# Parse options from the command line,
# unknown or malformed options end in the usage message
GetOptions(
  'input|i=s'   => \@input,
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000124
Nils Diewald7364d1f2013-11-05 19:26:35 +0000125
# Configure log4perl: the root logger writes to a colored STDERR appender
# and uses the upper-cased value of the log option as its level
my %log_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger used by the script itself
my $log = Log::Log4perl->get_logger('main');
135
Akron941c1a62016-02-23 17:41:41 +0100136
# Get file name based on path information.
#
# Removes the first input path (with an optional leading slash) from the
# given path, turns all remaining slashes into dashes and strips leading
# dashes. Takes the path of a text and returns the name without extension.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # The input path is quoted, so characters like '.' or '+' in it
  # are matched literally instead of being treated as regex syntax
  $file =~ s/\/?\Q$base\E//;
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
146
Akron941c1a62016-02-23 17:41:41 +0100147
# Write file.
#
# Converts a single text by calling this script again on the directory
# of the text. Takes the path of the text directory and returns the
# name of the text as computed by get_file_name (without extension).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is passed to system as a list, so no shell is involved:
  # paths with spaces stay intact and a value like '#all' is not
  # taken as the start of a shell comment
  my @call = ('perl', $LOCAL . '/korapxml2krill');
  push @call, '-i', $anno;
  push @call, '-o', $target;
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Forward --no-primary only if it was explicitly requested
  push @call, '--no-primary' if defined $primary && !$primary;

  # -y is a flag and takes no value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report a failing conversion instead of dropping it silently
  if (system(@call) != 0) {
    warn "Conversion of '$anno' failed\n";
  };

  return "$file";
};
174
Nils Diewald2db9ad02013-10-29 19:26:43 +0000175
# Convert each sigle in place to a path construct:
# the part before the first underscore, the part up to the next dot
# and the rest are joined by slashes, surrounding whitespace is dropped
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
178
# Process a single file
unless ($cmd) {
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip, keyed in lower case
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Take the start times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time since the last call and since the
  # start, and remember the current time for the next call
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Name used in log messages - the output is not defined
  # when the document is written to STDOUT
  my $name = $output // $input;

  # Create and parse new document (the path needs a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($name . " can't be processed - no document data");
    exit(0);
  };

  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($name . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotations as pairs of foundry and layer,
  # in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the explicitly requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      # Skip if Foundry or Foundry#Layer should be skipped
      unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
        $tokens->add(@$info);
        stop_time;
      };
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {

    # IO::File is not loaded at the top of the script
    require IO::File;

    my $file = $gzip ?
      IO::Compress::Gzip->new($output, Minimal => 1) :
      IO::File->new($output, "w");

    # Both constructors return a false value on failure
    unless ($file) {
      $log->error(
        "Unable to write to '$output': " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
}
Nils Diewald59094f22014-11-05 18:20:50 +0000337
# Extract XML files
elsif ($cmd eq 'extract') {

  my $input = $input[0];

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  # The output has to be an existing directory
  # NOTE(review): this exits with 0 although it is an error - confirm
  #   that no caller relies on it before changing the exit code
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Iterate over all given sigles and extract
    my $failed = 0;
    foreach my $sigle (@sigle) {
      print "$sigle ";
      my $extracted = $archive->extract('./'. $sigle, $output);
      $failed++ unless $extracted;
      print '' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # Only signal an error if at least one text could not be extracted
    exit($failed ? 1 : 0);
  };

  # Input is no file or can't be opened as an archive
  print "Unable to read archive '$input'.\n\n";
  exit(1);
}
372
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # The output has to be an existing directory
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The pid and the exit code come first,
      # the reference passed to finish() comes last
      my ($pid, $code) = @_;
      my $data = $_[-1];
      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  # Start time of processing - stays undefined if nothing is processed
  my $t;
  print "Reading data ...\n";

  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives -
    # the first archive was already passed to the constructor
    $archive->attach($_) foreach @input[1 .. $#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    # File::Temp is not loaded at the top of the script
    require File::Temp;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before the fork is finished
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # No time to report if the input could not be processed at all
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000542
543__END__
Akron941c1a62016-02-23 17:41:41 +0100544
545=pod
546
547=encoding utf8
548
549=head1 NAME
550
Akronf7ad89e2016-03-16 18:22:47 +0100551korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100552
553
554=head1 SYNOPSIS
555
Akronc13a1702016-03-15 19:33:14 +0100556 $ korapxml2krill -z --input <directory> --output <filename>
557 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100559
560
561=head1 DESCRIPTION
562
563L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
564compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100565The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100566
567
568=head1 INSTALLATION
569
570The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
571
572 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
573
Akronc13a1702016-03-15 19:33:14 +0100574In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100575be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100576
577
578=head1 ARGUMENTS
579
580=over 2
581
582=item B<archive>
583
Akrone10ad322016-02-27 10:54:26 +0100584Process an archive as a Zip-file or a folder of KorAP-XML documents.
585
586=item B<extract>
587
588Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100589
590=back
591
592
593=head1 OPTIONS
594
595=over 2
596
Akron2cfe8092016-06-24 17:48:49 +0200597=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100598
Akronf7ad89e2016-03-16 18:22:47 +0100599Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100600
Akron2cfe8092016-06-24 17:48:49 +0200601Multiple input archives are supported for archiving,
602with the constraint,
603that the first archive listed contains all primary data files
604and all meta data files.
605The directory structure follows the base directory format,
606starting with a C<.> root folder.
607In case an attached archive has no C<.> root folder,
608the archive path should start with a hash.
609
610 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
611
612
Akron941c1a62016-02-23 17:41:41 +0100613=item B<--output|-o> <directory|file>
614
615Output folder for archive processing or
616document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100617writes to C<STDOUT> by default
618(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100619
620=item B<--overwrite|-w>
621
622Overwrite files that already exist.
623
624=item B<--token|-t> <foundry>[#<file>]
625
626Define the default tokenization by specifying
627the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100628of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100629
630=item B<--skip|-s> <foundry>[#<layer>]
631
Akronf7ad89e2016-03-16 18:22:47 +0100632Skip specific annotations by specifying the foundry
633(and optionally the layer with a C<#>-prefix),
634e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100635Can be set multiple times.
636
Akronc13a1702016-03-15 19:33:14 +0100637=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100638
Akronf7ad89e2016-03-16 18:22:47 +0100639Convert specific annotations by specifying the foundry
640(and optionally the layer with a C<#>-prefix),
641e.g. C<Mate> or C<Mate#Morpho>.
642Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100643
644=item B<--primary|-p>
645
Akronc13a1702016-03-15 19:33:14 +0100646Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100647Can be flagged using C<--no-primary> as well.
648This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100649
650=item B<--jobs|-j>
651
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100653for archive processing.
Akron11c80302016-03-18 19:44:43 +0100654Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100655This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100656
Akron35db6e32016-03-17 22:42:22 +0100657=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100658
Akron35db6e32016-03-17 22:42:22 +0100659Define the metadata parser to use. Defaults to C<I5>.
660Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
661This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100662
663=item B<--pretty|-y>
664
Akronc13a1702016-03-15 19:33:14 +0100665Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100666This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100667
668=item B<--gzip|-z>
669
Akronf7ad89e2016-03-16 18:22:47 +0100670Compress the output.
671Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100672
Akron11c80302016-03-18 19:44:43 +0100673=item B<--cache|-c>
674
675File to mmap a cache (using L<Cache::FastMmap>).
676Defaults to C<korapxml2krill.cache> in the calling directory.
677
678=item B<--cache-size|-cs>
679
680Size of the cache. Defaults to C<50m>.
681
682=item B<--cache-init|-ci>
683
684Initialize cache file.
685Can be flagged using C<--no-cache-init> as well.
686Defaults to C<true>.
687
688=item B<--cache-delete|-cd>
689
690Delete cache file after processing.
691Can be flagged using C<--no-cache-delete> as well.
692Defaults to C<true>.
693
Akrone10ad322016-02-27 10:54:26 +0100694=item B<--sigle|-sg>
695
696Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100697Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100698I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100699
Akron941c1a62016-02-23 17:41:41 +0100700=item B<--log|-l>
701
702The L<Log4perl> log level, defaults to C<ERROR>.
703
704=item B<--help|-h>
705
706Print this document.
707
708=item B<--version|-v>
709
710Print version information.
711
712=back
713
Akronc13a1702016-03-15 19:33:14 +0100714=head1 ANNOTATION SUPPORT
715
716L<KorAP::XML::Krill> has built-in importer for some annotation foundries and layers
717developed in the KorAP project that are part of the KorAP preprocessing pipeline.
718The base foundry with paragraphs, sentences, and the text element are mandatory for
719L<Krill|https://github.com/KorAP/Krill>.
720
Akronf7ad89e2016-03-16 18:22:47 +0100721=over 2
Akronc13a1702016-03-15 19:33:14 +0100722
723=item B<Base>
724
725=over 4
726
Akronf7ad89e2016-03-16 18:22:47 +0100727=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100728
Akronf7ad89e2016-03-16 18:22:47 +0100729=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100730
731=back
732
733=item B<Connexor>
734
735=over 4
736
Akronf7ad89e2016-03-16 18:22:47 +0100737=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100738
Akronf7ad89e2016-03-16 18:22:47 +0100739=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100740
Akronf7ad89e2016-03-16 18:22:47 +0100741=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100742
Akronf7ad89e2016-03-16 18:22:47 +0100743=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100744
745=back
746
747=item B<CoreNLP>
748
749=over 4
750
Akronf7ad89e2016-03-16 18:22:47 +0100751=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100752
Akronf7ad89e2016-03-16 18:22:47 +0100753=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100754
Akronf7ad89e2016-03-16 18:22:47 +0100755=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100756
Akronf7ad89e2016-03-16 18:22:47 +0100757=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100758
759=back
760
761=item B<DeReKo>
762
763=over 4
764
Akronf7ad89e2016-03-16 18:22:47 +0100765=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100766
767=back
768
769=item B<Glemm>
770
771=over 4
772
Akronf7ad89e2016-03-16 18:22:47 +0100773=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100774
=back

=item B<Malt>

=over 4

=item #Dependency

=back

=item B<Mate>
778
779=over 4
780
Akronf7ad89e2016-03-16 18:22:47 +0100781=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100782
Akronf7ad89e2016-03-16 18:22:47 +0100783=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100784
785=back
786
787=item B<OpenNLP>
788
789=over 4
790
Akronf7ad89e2016-03-16 18:22:47 +0100791=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100794
795=back
796
797=item B<Sgbr>
798
799=over 4
800
Akronf7ad89e2016-03-16 18:22:47 +0100801=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100802
Akronf7ad89e2016-03-16 18:22:47 +0100803=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100804
805=back
806
807=item B<TreeTagger>
808
809=over 4
810
Akronf7ad89e2016-03-16 18:22:47 +0100811=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100812
Akronf7ad89e2016-03-16 18:22:47 +0100813=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100814
815=back
816
817=item B<XIP>
818
819=over 4
820
=item #Constituency

=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100822
Akronf7ad89e2016-03-16 18:22:47 +0100823=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100824
Akronf7ad89e2016-03-16 18:22:47 +0100825=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100826
827=back
828
829=back
830
831More importers are in preparation.
832New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
833See the built-in annotation importers as examples.
834
Akron941c1a62016-02-23 17:41:41 +0100835=head1 AVAILABILITY
836
837 https://github.com/KorAP/KorAP-XML-Krill
838
839
840=head1 COPYRIGHT AND LICENSE
841
842Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100843
Akron941c1a62016-02-23 17:41:41 +0100844Author: L<Nils Diewald|http://nils-diewald.de/>
845
846L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
847Corpus Analysis Platform at the
848L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
849member of the
850L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
851
852This program is free software published under the
853L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
854
855=cut