blob: cde5e37d70dced84ca9edb270232aa4d0edf1969 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010019# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010020
Akron941c1a62016-02-23 17:41:41 +010021# CHANGES:
22# ----------------------------------------------------------
23# 2013/11/25
24# - Initial release
25#
26# 2014/10/29
27# - Merges foundry data to create indexer friendly documents
28#
Akron93d620e2016-02-05 19:40:05 +010029# 2016/02/04
30# - renamed to korapxml2krill
31# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010032#
33# 2016/02/12
34# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010035# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010036#
37# 2016/02/14
38# - Added version information
Akron941c1a62016-02-23 17:41:41 +010039# - Added support for archive files
40#
41# 2016/02/15
42# - Fixed temporary directory bug
43# - Improved skipping before unzipping
44# - Added EXPERIMENTAL concurrency support
45#
46# 2016/02/23
47# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010048#
49# 2016/02/27
50# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010051#
52# 2016/03/17
53# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010054#
55# 2016/03/18
56# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020057#
Akronf3f0c942016-06-27 13:27:14 +020058# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020059# - Added multi archive support
60# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020061# - Added Malt#Dependency support
Akron941c1a62016-02-23 17:41:41 +010062# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010063
# Date of the most recent entry in the CHANGES list above.
# It is part of the version banner, so it has to be updated together
# with the change log.
our $LAST_CHANGE = '2016/06/27';

# Directory this script is located in (used to call the script again
# for every single document in archive mode)
our $LOCAL = $FindBin::Bin;

# Version banner passed to pod2usage() for --help, --version and errors
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
69
70
# Parse command:
# A first argument that is not an option (i.e. does not start with
# a hyphen) is taken off the argument list and used as the command.
# Without a command, $cmd stays undefined.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';
Akron93d620e2016-02-05 19:40:05 +010077
# Options that can be set multiple times are collected in arrays.
# They are declared up front and passed as \@array on purpose:
# \(my @array) evaluates to a list of references to the array's
# (not yet existing) elements, i.e. to an empty list, which detaches
# the option from its variable.
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),

  # Print the manual sections relevant for usage
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },

  # Print the version banner and the synopsis
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
);

# Arguments for pod2usage() in case of invalid calls:
# print the usage sections and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000124
Nils Diewald7364d1f2013-11-05 19:26:35 +0000125
Akron941c1a62016-02-23 17:41:41 +0100126# Initialize log4perl object
# Initialize log4perl object:
# The root logger uses the (upper-cased) level given by --log and
# writes to a single appender named STDERR.
my %log4perl_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log4perl_conf);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
135
Akron941c1a62016-02-23 17:41:41 +0100136
137# Get file name based on path information
# Turn the path of a text into a flat file name.
#
# Expects a path as its only argument and returns the name with
#   - a leading '/tmp/<name>' component removed,
#   - a leading copy of the first --input value removed,
#   - all slashes replaced by hyphens, and
#   - leading hyphens removed.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Delete a leading temporary directory
  $file =~ s!^/?tmp/[^/]+!!;

  # Delete the input prefix. The input path is given by the user,
  # so it has to be matched literally (\Q...\E) instead of being
  # interpreted as a pattern, and only at the beginning of the path.
  $file =~ s/^\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
147
Akron941c1a62016-02-23 17:41:41 +0100148
149# Write file
# Convert a single text by calling this script again in
# single-document mode in a separate process.
#
# Expects the directory of the text as its only argument.
# The options of the current call are forwarded; the cache is shared
# and neither initialized nor deleted by the called process.
# Returns the flat file name (without output directory and
# file extension) of the text.
sub write_file {
  my $anno = shift;
  my $file = get_file_name $anno;

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is assembled as a list, so system() does not pass it to
  # a shell: paths containing whitespace or shell metacharacters
  # reach the called script unchanged.
  my @call = ('perl', $LOCAL . '/korapxml2krill');
  push @call, '-i', $anno;
  push @call, '-o', $target;
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # --primary is negatable: forward the flag exactly as it was given
  # and forward nothing in case it was not given at all
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # --pretty is a plain flag and takes no value
  push @call, '-y' if $pretty;

  push @call, map { ('-a', $_) } @anno;
  push @call, map { ('-s', $_) } @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error("Conversion of '$anno' failed (status $?)");
  };

  return "$file";
};
175
Nils Diewald2db9ad02013-10-29 19:26:43 +0000176
# Convert sigle to path construct:
# the part before the first underscore, the part up to the following
# dot, and the rest are joined by slashes; surrounding whitespace
# is removed. Sigles not matching this pattern are left untouched.
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
179
Akron941c1a62016-02-23 17:41:41 +0100180# Process a single file
unless ($cmd) {
  # Process a single document:
  # parse the document in the input directory, add all requested
  # annotation layers and write the result as JSON to the output file
  # (or to STDOUT, if no output file is given).
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip are compared in lower case
  my %skip = map { (lc($_) => 1) } @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time passed since the last call
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document
  # (the path is passed with a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  # Name of the document in log messages -
  # there is no output file when printing to STDOUT
  my $name = $output // $input;

  unless ($doc->parse) {
    $log->warn($name . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization: the layer is optional in --token,
  # so the default layer is kept if only a foundry is given
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry;
    $token_base_layer = $layer if defined $layer && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($name . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotations as [foundry, layer] pairs,
  # listed in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - add the explicitly requested annotations only
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }

  # Add to index file - respect skipping
  else {
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1)
        or $log->error("Unable to write to '$output': $GzipError");
    }
    else {
      # IO::File is not loaded at the top of the script
      require IO::File;
      $file = IO::File->new($output, "w")
        or $log->error("Unable to write to '$output': $!");
    };

    # Without a file handle there is nothing to write to
    exit(1) unless $file;

    $file->print($print_text);
    $file->close
      or $log->error("Unable to close '$output': $!");
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000338
Akrone10ad322016-02-27 10:54:26 +0100339# Extract XML files
elsif ($cmd eq 'extract') {
  # Extract XML files:
  # unpack the texts given by --sigle from the input archive
  # to the output directory.
  my $input = $input[0];

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  # NOTE(review): This is an error but exits with status 0 -
  # kept as is, as callers may rely on it.
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Iterate over all given sigles and extract
    my $failed = 0;
    foreach (@sigle) {
      my $extracted = $archive->extract('./'. $_, $output);
      $failed++ unless $extracted;

      print "$_ ";
      print '' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # Only fail in case at least one text could not be extracted
    exit($failed ? 1 : 0);
  };

  # The input is no file or can't be opened as an archive
  print "Unable to extract from '$input'.\n\n";
  exit(1);
}
373
Akron941c1a62016-02-23 17:41:41 +0100374# Process an archive
elsif ($cmd eq 'archive') {
  # Process an archive:
  # convert all texts of a directory or of a Zip-file and write one
  # file per text to the output directory. Each text is converted by
  # write_file(), optionally in forked processes (see --jobs).

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # NOTE(review): This is an error but exits with status 0 -
  # kept as is, as callers may rely on it.
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # File::Temp is not loaded at the top of the script
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter = 1;  # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The process id and the exit code are passed first,
      # the data reference given to finish() is passed last
      my ($pid, $code, @rest) = @_;
      my $data = $rest[-1];

      # There is no data in case the process ended without finish()
      my $msg = (ref $data eq 'SCALAR' && defined $$data) ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Make sure the cache shared with the converting processes is available
  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    # NOTE(review): This attaches the first (primary) archive as well -
    # confirm with KorAP::XML::Archive, if that is intended.
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {

        # NOTE(review): The original author marked this file name
        # construction as "not correct" - to be verified.
        my $filename = catfile(
          $output,
          get_file_name(
            catfile($corpus, $doc, $text)
              . '.json' . ($gzip ? '.gz' : '')
            )
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before the process is finished
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing was processed, so there is nothing to wait for or to time
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000548
549__END__
Akron941c1a62016-02-23 17:41:41 +0100550
551=pod
552
553=encoding utf8
554
555=head1 NAME
556
Akronf7ad89e2016-03-16 18:22:47 +0100557korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100558
559
560=head1 SYNOPSIS
561
Akronc13a1702016-03-15 19:33:14 +0100562 $ korapxml2krill -z --input <directory> --output <filename>
563 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100565
566
567=head1 DESCRIPTION
568
569L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
570compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100571The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100572
573
574=head1 INSTALLATION
575
576The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
577
578 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
579
Akronc13a1702016-03-15 19:33:14 +0100580In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100581be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100582
583
584=head1 ARGUMENTS
585
586=over 2
587
588=item B<archive>
589
Akrone10ad322016-02-27 10:54:26 +0100590Process an archive as a Zip-file or a folder of KorAP-XML documents.
591
592=item B<extract>
593
594Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100595
596=back
597
598
599=head1 OPTIONS
600
601=over 2
602
Akron2cfe8092016-06-24 17:48:49 +0200603=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100604
Akronf7ad89e2016-03-16 18:22:47 +0100605Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100606
Akron2cfe8092016-06-24 17:48:49 +0200607Multiple input archives are supported for archiving,
608with the constraint,
609that the first archive listed contains all primary data files
610and all meta data files.
611The directory structure follows the base directory format,
612starting with a C<.> root folder.
613In case an attached archive has no C<.> root folder,
614the archive path should start with a hash.
615
616 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
617
618
Akron941c1a62016-02-23 17:41:41 +0100619=item B<--output|-o> <directory|file>
620
621Output folder for archive processing or
622document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100623writes to C<STDOUT> by default
624(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100625
626=item B<--overwrite|-w>
627
628Overwrite files that already exist.
629
630=item B<--token|-t> <foundry>[#<file>]
631
632Define the default tokenization by specifying
633the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100635
636=item B<--skip|-s> <foundry>[#<layer>]
637
Akronf7ad89e2016-03-16 18:22:47 +0100638Skip specific annotations by specifying the foundry
639(and optionally the layer with a C<#>-prefix),
640e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100641Can be set multiple times.
642
Akronc13a1702016-03-15 19:33:14 +0100643=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100644
Akronf7ad89e2016-03-16 18:22:47 +0100645Convert specific annotations by specifying the foundry
646(and optionally the layer with a C<#>-prefix),
647e.g. C<Mate> or C<Mate#Morpho>.
648Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100649
650=item B<--primary|-p>
651
Akronc13a1702016-03-15 19:33:14 +0100652Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100653Can be flagged using C<--no-primary> as well.
654This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100655
656=item B<--jobs|-j>
657
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100659for archive processing.
Akron11c80302016-03-18 19:44:43 +0100660Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100661This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100662
Akron35db6e32016-03-17 22:42:22 +0100663=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100664
Akron35db6e32016-03-17 22:42:22 +0100665Define the metadata parser to use. Defaults to C<I5>.
666Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
667This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100668
669=item B<--pretty|-y>
670
Akronc13a1702016-03-15 19:33:14 +0100671Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100672This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100673
674=item B<--gzip|-z>
675
Akronf7ad89e2016-03-16 18:22:47 +0100676Compress the output.
677Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100678
Akron11c80302016-03-18 19:44:43 +0100679=item B<--cache|-c>
680
681File to mmap a cache (using L<Cache::FastMmap>).
682Defaults to C<korapxml2krill.cache> in the calling directory.
683
684=item B<--cache-size|-cs>
685
686Size of the cache. Defaults to C<50m>.
687
688=item B<--cache-init|-ci>
689
690Initialize cache file.
691Can be flagged using C<--no-cache-init> as well.
692Defaults to C<true>.
693
694=item B<--cache-delete|-cd>
695
696Delete cache file after processing.
697Can be flagged using C<--no-cache-delete> as well.
698Defaults to C<true>.
699
Akrone10ad322016-02-27 10:54:26 +0100700=item B<--sigle|-sg>
701
702Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100703Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100704I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100705
Akron941c1a62016-02-23 17:41:41 +0100706=item B<--log|-l>
707
708The L<Log4perl> log level, defaults to C<ERROR>.
709
710=item B<--help|-h>
711
712Print this document.
713
714=item B<--version|-v>
715
716Print version information.
717
718=back
719
Akronc13a1702016-03-15 19:33:14 +0100720=head1 ANNOTATION SUPPORT
721
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
723developed in the KorAP project that are part of the KorAP preprocessing pipeline.
724The base foundry with paragraphs, sentences, and the text element are mandatory for
725L<Krill|https://github.com/KorAP/Krill>.
726
Akronf7ad89e2016-03-16 18:22:47 +0100727=over 2
Akronc13a1702016-03-15 19:33:14 +0100728
729=item B<Base>
730
731=over 4
732
Akronf7ad89e2016-03-16 18:22:47 +0100733=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100734
Akronf7ad89e2016-03-16 18:22:47 +0100735=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100736
737=back
738
739=item B<Connexor>
740
741=over 4
742
Akronf7ad89e2016-03-16 18:22:47 +0100743=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100744
Akronf7ad89e2016-03-16 18:22:47 +0100745=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100746
Akronf7ad89e2016-03-16 18:22:47 +0100747=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100748
Akronf7ad89e2016-03-16 18:22:47 +0100749=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100750
751=back
752
753=item B<CoreNLP>
754
755=over 4
756
Akronf7ad89e2016-03-16 18:22:47 +0100757=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100758
Akronf7ad89e2016-03-16 18:22:47 +0100759=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100760
Akronf7ad89e2016-03-16 18:22:47 +0100761=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100762
Akronf7ad89e2016-03-16 18:22:47 +0100763=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100764
765=back
766
767=item B<DeReKo>
768
769=over 4
770
Akronf7ad89e2016-03-16 18:22:47 +0100771=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100772
773=back
774
775=item B<Glemm>
776
777=over 4
778
Akronf7ad89e2016-03-16 18:22:47 +0100779=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100780
781=back
782
=item B<Malt>

=over 4

=item #Dependency

=back

=item B<Mate>
784
785=over 4
786
Akronf7ad89e2016-03-16 18:22:47 +0100787=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100788
Akronf7ad89e2016-03-16 18:22:47 +0100789=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100790
791=back
792
793=item B<OpenNLP>
794
795=over 4
796
Akronf7ad89e2016-03-16 18:22:47 +0100797=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100798
Akronf7ad89e2016-03-16 18:22:47 +0100799=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100800
801=back
802
803=item B<Sgbr>
804
805=over 4
806
Akronf7ad89e2016-03-16 18:22:47 +0100807=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100808
Akronf7ad89e2016-03-16 18:22:47 +0100809=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100810
811=back
812
813=item B<TreeTagger>
814
815=over 4
816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100818
Akronf7ad89e2016-03-16 18:22:47 +0100819=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100820
821=back
822
823=item B<XIP>
824
825=over 4
826
=item #Constituency

=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100828
Akronf7ad89e2016-03-16 18:22:47 +0100829=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100830
Akronf7ad89e2016-03-16 18:22:47 +0100831=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100832
833=back
834
835=back
836
837More importers are in preparation.
838New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
839See the built-in annotation importers as examples.
840
Akron941c1a62016-02-23 17:41:41 +0100841=head1 AVAILABILITY
842
843 https://github.com/KorAP/KorAP-XML-Krill
844
845
846=head1 COPYRIGHT AND LICENSE
847
848Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100849
Akron941c1a62016-02-23 17:41:41 +0100850Author: L<Nils Diewald|http://nils-diewald.de/>
851
852L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
853Corpus Analysis Platform at the
854L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
855member of the
856L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
857
858This program is free software published under the
859L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
860
861=cut