#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp ();
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use IO::File ();
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files

# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
# ----------------------------------------------------------

# Date of the most recent entry in the CHANGES list
our $LAST_CHANGE = '2016/06/27';

# Directory of this script - write_file() calls the script from there
our $LOCAL = $FindBin::Bin;

# Banner passed as -msg to all pod2usage() calls
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION


# Parse command
# The first argument is treated as the command,
# unless it starts with a dash (and is therefore an option).
my $cmd;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Options that can be set multiple times.
# @anno is declared here (instead of inline as \(my @anno)),
# because a reference to a parenthesized array yields references
# to its elements - for an empty array that is an empty list,
# which leaves the option without a target.
my (@skip, @sigle, @input, @anno);
my $text;

# Parameters for pod2usage on erroneous calls
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line
# (unknown or malformed options lead to the usage message)
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),
  'help|h'           => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'        => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;


# Initialize log4perl object
# The root logger uses the level given by --log (in upper case)
# and a single appender named STDERR.
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

# Logger used by the following code
my $log = Log::Log4perl->get_logger('main');


# Get file name based on path information
#
# Turns the path of a text into a flat name: a leading
# temporary directory ('tmp/<name>') and the first --input value
# are removed, the remaining slashes are replaced by dashes
# and leading dashes are dropped.
#
# Expects the path as the only parameter and returns the name
# (no file extension is added).
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove the temporary directory
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path - quoted, so characters like '.' or '+'
  # in the path are matched literally instead of as a pattern
  $file =~ s/\/?\Q$base\E//;

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};


# Write file
#
# Converts a single text by calling this script again (without a
# command) and passing on the relevant options.
#
# Expects the path of the text directory and returns the name
# the output file is based on (see get_file_name).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  # File to write the converted text to
  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is built as a list, so no shell is involved:
  # Whitespace in paths and values starting with '#' (like '#all')
  # reach the called script unchanged.
  my @call = ($^X, $LOCAL . '/korapxml2krill');
  push @call, '-i', $anno;
  push @call, '-o', $target;
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Pass on the primary switch as given (only if it was given at all)
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # --pretty is a flag and takes no value
  push @call, '-y' if $pretty;

  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing calls instead of ignoring them
  if (system(@call) != 0) {
    $log->error("Unable to convert '$anno' (status $?)");
  };

  return "$file";
};


# Convert sigle to path construct
# (e.g. 'CORPUS_DOC.TEXT' becomes 'CORPUS/DOC/TEXT')
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Process a single file
unless ($cmd) {
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Annotations to skip (in lower case)
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # BEGIN blocks run at compile time, so the clocks are started
  # before any of the surrounding code is executed
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call (and overall)
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Name of the text in log messages
  # (there is no output file when printing to STDOUT)
  my $label = $output // $input;

  # Create and parse new document
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($label . " can't be processed - no document data");
    exit(0);
  };

  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;

    # The layer is optional - don't pass an undefined one
    $token_base_layer //= 'Tokens';
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($label . " can't be processed - no base tokenization");
    exit(0);
  };

  # All annotations added by default (as pairs of foundry and layer)
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - add the requested annotations only
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file = $gzip
      ? IO::Compress::Gzip->new($output, Minimal => 1)
      : IO::File->new($output, "w");

    # Both constructors return undef on failure
    unless ($file) {
      $log->error(
        "Unable to write to '$output': " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  # Write to STDOUT
  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}

# Extract XML files
# Extracts the texts given by --sigle from the archive given
# by the first --input to the directory given by --output.
elsif ($cmd eq 'extract') {

  my $input = $input[0];

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # TODO

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Number of sigles that could not be extracted
    my $failed = 0;

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ";
      unless ($archive->extract('./'. $_, $output)) {
        $failed++;
        print 'not ';
      };
      print "extracted.\n";
    };

    print "\n";

    # Only signal an error in case something went wrong
    exit($failed ? 1 : 0);
  }

  # Don't ignore unusable input silently
  else {
    print "Input is not an archive.\n\n";
    exit(1);
  };
}

# Process an archive
# Converts all texts of a directory or of a zip archive by calling
# write_file() per text - optionally in multiple forks.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The pid and the exit code are passed first,
      # the data given to finish() is passed last
      my ($pid, $code) = @_;
      my $data = $_[-1];
      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  # Initialized here, so the final report works even if nothing
  # was processed - reset as soon as the processing starts
  my $t = Benchmark->new;
  print "Reading data ...\n";

  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    # NOTE(review): This passes the first input as well, although
    # the archive was created with it - confirm attach() copes with that.
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {

        # This is not correct!!
        my $filename = catfile(
          $output,
          get_file_name(
            catfile($corpus, $doc, $text)
              . '.json' . ($gzip ? '.gz' : '')
            )
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
# Report the command and print the usage message
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}

__END__

=pod

=encoding utf8

=head1 NAME

korapxml2krill - Merge KorAP-XML data and create Krill documents


=head1 SYNOPSIS

  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>


=head1 DESCRIPTION

L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
The C<korapxml2krill> command line tool is a simple wrapper to the library.


=head1 INSTALLATION

The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.

  $ cpanm https://github.com/KorAP/KorAP-XML-Krill

In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.


=head1 ARGUMENTS

=over 2

=item B<archive>

Process an archive as a Zip-file or a folder of KorAP-XML documents.

=item B<extract>

Extract KorAP-XML files from a Zip-file.

=back


=head1 OPTIONS

=over 2

=item B<--input|-i> <directory|file|files>

Directory or archive file of documents to convert.

Archiving supports multiple input archives with the constraint
that the first archive listed contains all primary data files
and all meta data files.

  -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip

(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.)

=item B<--output|-o> <directory|file>

Output folder for archive processing or
document name for single output (optional),
writes to C<STDOUT> by default
(in case C<output> is not mandatory due to further options).

=item B<--overwrite|-w>

Overwrite files that already exist.

=item B<--token|-t> <foundry>[#<file>]

Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.

=item B<--skip|-s> <foundry>[#<layer>]

Skip specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Can be set multiple times.

=item B<--anno|-a> <foundry>#<layer>

Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Only respected in combination with C<--skip #ALL>.
Can be set multiple times.

=item B<--primary|-p>

Output primary data or not. Defaults to C<true>.
Can be flagged using C<--no-primary> as well.
This is I<deprecated>.

=item B<--jobs|-j>

Define the number of concurrent jobs in separated forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
This is I<experimental>.

=item B<--meta|-m>

Define the metadata parser to use. Defaults to C<I5>.
Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
This is I<experimental>.

=item B<--pretty|-y>

Pretty print JSON output. Defaults to C<false>.
This is I<deprecated>.

=item B<--gzip|-z>

Compress the output.
Expects a defined C<output> file in single processing.

=item B<--cache|-c>

File to mmap a cache (using L<Cache::FastMmap>).
Defaults to C<korapxml2krill.cache> in the calling directory.

=item B<--cache-size|-cs>

Size of the cache. Defaults to C<50m>.

=item B<--cache-init|-ci>

Initialize cache file.
Can be flagged using C<--no-cache-init> as well.
Defaults to C<true>.

=item B<--cache-delete|-cd>

Delete cache file after processing.
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.

=item B<--sigle|-sg>

Extract the given text sigles.
Can be set multiple times.
I<Currently only supported on C<extract>.>

=item B<--log|-l>

The L<Log4perl> log level, defaults to C<ERROR>.

=item B<--help|-h>

Print this document.

=item B<--version|-v>

Print version information.

=back

=head1 ANNOTATION SUPPORT

L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.

=over 2

=item B<Base>

=over 4

=item #Paragraphs

=item #Sentences

=back

=item B<Connexor>

=over 4

=item #Morpho

=item #Phrase

=item #Sentences

=item #Syntax

=back

=item B<CoreNLP>

=over 4

=item #Constituency

=item #Morpho

=item #NamedEntities

=item #Sentences

=back

=item B<DeReKo>

=over 4

=item #Structure

=back

=item B<Glemm>

=over 4

=item #Morpho

=back

=item B<Malt>

=over 4

=item #Dependency

=back

=item B<Mate>

=over 4

=item #Dependency

=item #Morpho

=back

=item B<OpenNLP>

=over 4

=item #Morpho

=item #Sentences

=back

=item B<Sgbr>

=over 4

=item #Lemma

=item #Morpho

=back

=item B<TreeTagger>

=over 4

=item #Morpho

=item #Sentences

=back

=item B<XIP>

=over 4

=item #Constituency

=item #Dependency

=item #Morpho

=item #Sentences

=back

=back

More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
See the built-in annotation importers as examples.

=head1 AVAILABILITY

  https://github.com/KorAP/KorAP-XML-Krill


=head1 COPYRIGHT AND LICENSE

Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>

Author: L<Nils Diewald|http://nils-diewald.de/>

L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.

This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

=cut