blob: e668f0749da36053ebd587934433597b676c5417 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
# Date of the last change to this script (part of the version banner)
our $LAST_CHANGE = '2016/08/16';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Version banner shown by the usage, help and version output
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command: a first argument that is not an option
# (i.e. does not start with a hyphen) is taken as the command
my $cmd;
if ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') {
  $cmd = shift @ARGV;
};
Akron93d620e2016-02-05 19:40:05 +010080
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $token_base, $gzip, $primary, $pretty);

# Options with a default value
my $cache_file   = 'korapxml2krill.cache';
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;
my $log_level    = 'ERROR';
my $jobs         = 0;

# Print the version banner followed by the main POD sections
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
  );
};

# Print the version banner followed by the short usage
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG
  )
};

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'help|h'           => $print_help,
  'version|v'        => $print_version
);
117
# Arguments for pod2usage in case of wrong usage:
# print the version banner and the main POD sections, then exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};

# Configuration of the log4perl object:
# write colored messages of the requested level to STDERR
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
140
# Lookup table of everything passed by --skip (keys are lowercased)
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotations as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped: only the annotations requested
# explicitly by --anno (Foundry#Layer) are converted
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Drop if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
212
# Get tokenization basis (Foundry#Layer).
# The variables are declared unconditionally and only assigned if a token
# base was passed: a "my" declaration with a statement modifier
# ("my ... if ...") has undefined behaviour in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache shared via a memory mapped file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Converter object used by all commands;
# it receives the cache, the logger and the conversion options
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
235
Akron941c1a62016-02-23 17:41:41 +0100236
# Get file name based on path information.
#
# Removes a leading temporary directory ("/tmp/<name>") and the first
# input path given on the command line from the passed path, replaces all
# remaining slashes by hyphens and strips leading hyphens.
#
# Parameter: the path of a document as a string.
# Returns:   the flattened file name.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove temporary directory
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is quoted to be matched literally -
  # otherwise characters like '.', '+' or '(' in the input path
  # are treated as regular expression syntax.
  # NOTE(review): the optional anchor ('^?') lets the pattern match
  # anywhere in the path - confirm that this is intended.
  $file =~ s/^?\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
247
Akron941c1a62016-02-23 17:41:41 +0100248
249# Write file
Akrone1dbc382016-07-08 22:24:52 +0200250#sub write_file {
251# my $anno = shift;
252# my $file = get_file_name $anno;
253#
254# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
255#
256# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
257# $call .= ' -i ' . $anno;
258# $call .= ' -o ' . $output . '/' . $file . '.json';
259# $call .= '.gz -z' if $gzip;
260# $call .= ' -m ' . $meta if $meta;
261# $call .= ' -w' if $overwrite;
262# $call .= ' -t ' . $token_base if $token_base;
263# $call .= ' -l ' . $log_level if $log_level;
264# $call .= ' -c ' . $cache_file;
265# $call .= ' -cs ' . $cache_size;
266# $call .= ' --no-cache-delete'; # Don't delete the cache
267# $call .= ' --no-cache-init'; # Don't initialize the cache
268# $call .= ' --no-primary ' if $primary;
269# $call .= ' -y ' . $pretty if $pretty;
270# $call .= ' -a ' . $_ foreach @anno;
271# $call .= ' -s ' . $_ foreach @skip;
272# system($call);
273# return "$file";
274#};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000275
Nils Diewald2db9ad02013-10-29 19:26:43 +0000276
# Convert sigle to path construct:
# "Corpus_Document.Text" (optionally surrounded by whitespace)
# becomes "Corpus/Document/Text"; other values are left untouched
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
279
# Dispatch on the optional command passed as the first argument:
# - no command: convert a single document
# - extract:    extract texts from an archive
# - archive:    convert all documents of a directory or an archive

# Process a single file
unless ($cmd) {
  my $input = $input[0];

  # BEGIN blocks run at compile time, so both timers are
  # started before any other statement of the script is executed
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on info level) the time elapsed since the last call
  # and since the timers were started, then reset the lap timer
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
    );
    $main::LAST_STOP = $new;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
}

# Extract XML files
elsif ($cmd eq 'extract') {

  # The output has to be an existing directory
  # NOTE(review): this error exits with status 0 - confirm that callers
  # rely on it before changing the status
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # TODO: Support sigles and full archives

  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ";
      print '' . ($archive->extract('./' . $_, $output) ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # NOTE(review): exits with status 1 after the extraction was attempted,
    # while the failure branch below ends with status 0 - confirm intent
    exit(1);
  }
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
  };
}

# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # The output has to be an existing directory
  # NOTE(review): this error exits with status 0 - confirm that callers
  # rely on it before changing the status
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the process id and the exit code first
      # and the data reference passed to finish() as the last argument
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # A job may finish without a message
      my $report = ref $data ? $$data : undef;
      $report = '' unless defined $report;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $report\n";
    }
  );

  # Start time of the processing (stays undefined without valid input)
  my $t;
  print "Reading data ...\n";

#  unless (Cache::FastMmap->new(
#    share_file => $cache_file,
#    cache_size => $cache_size,
#    init_file => $cache_init
#  )) {
#    print "Unable to initialize cache '$cache_file'\n\n";
#    exit(1);
#  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $path;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($path = $it->get) && $path =~ s{/data\.xml$}{}) {
        push @dirs, $path;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next directory
      $pool->start and next DIRECTORY_LOOP;

      my $msg = $batch_file->process($dir => $filename);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory (removed as soon as the object is released)
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - to the file name computed for this text,
        # not to the output directory itself
        $msg = $batch_file->process($dir => $filename);

        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # The timer was only started, if there was something to process
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000511
512__END__
Akron941c1a62016-02-23 17:41:41 +0100513
514=pod
515
516=encoding utf8
517
518=head1 NAME
519
Akronf7ad89e2016-03-16 18:22:47 +0100520korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100521
522
523=head1 SYNOPSIS
524
Akronc13a1702016-03-15 19:33:14 +0100525 $ korapxml2krill -z --input <directory> --output <filename>
526 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100528
529
530=head1 DESCRIPTION
531
532L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
533compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100534The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100535
536
537=head1 INSTALLATION
538
539The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
540
541 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
542
Akronc13a1702016-03-15 19:33:14 +0100543In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100544be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100545
546
547=head1 ARGUMENTS
548
549=over 2
550
551=item B<archive>
552
Akrone10ad322016-02-27 10:54:26 +0100553Process an archive as a Zip-file or a folder of KorAP-XML documents.
554
555=item B<extract>
556
557Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100558
559=back
560
561
562=head1 OPTIONS
563
564=over 2
565
Akron2cfe8092016-06-24 17:48:49 +0200566=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100567
Akronf7ad89e2016-03-16 18:22:47 +0100568Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100569
Akron0c3e3752016-06-28 15:55:53 +0200570Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200571that the first archive listed contains all primary data files
572and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200573
574 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
575
Akron0c3e3752016-06-28 15:55:53 +0200576(The directory structure follows the base directory format,
577that may include a C<.> root folder.
578In this case further archives lacking a C<.> root folder
579need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200580
Akron941c1a62016-02-23 17:41:41 +0100581=item B<--output|-o> <directory|file>
582
583Output folder for archive processing or
584document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100585writes to C<STDOUT> by default
586(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100587
588=item B<--overwrite|-w>
589
590Overwrite files that already exist.
591
592=item B<--token|-t> <foundry>[#<file>]
593
594Define the default tokenization by specifying
595the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100596of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100597
598=item B<--skip|-s> <foundry>[#<layer>]
599
Akronf7ad89e2016-03-16 18:22:47 +0100600Skip specific annotations by specifying the foundry
601(and optionally the layer with a C<#>-prefix),
602e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100603Can be set multiple times.
604
Akronc13a1702016-03-15 19:33:14 +0100605=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100606
Akronf7ad89e2016-03-16 18:22:47 +0100607Convert specific annotations by specifying the foundry
608(and optionally the layer with a C<#>-prefix),
609e.g. C<Mate> or C<Mate#Morpho>.
610Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100611
612=item B<--primary|-p>
613
Akronc13a1702016-03-15 19:33:14 +0100614Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100615Can be flagged using C<--no-primary> as well.
616This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100617
618=item B<--jobs|-j>
619
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100621for archive processing.
Akron11c80302016-03-18 19:44:43 +0100622Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100623This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100624
Akron35db6e32016-03-17 22:42:22 +0100625=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100626
Akron35db6e32016-03-17 22:42:22 +0100627Define the metadata parser to use. Defaults to C<I5>.
628Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
629This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100630
631=item B<--pretty|-y>
632
Akronc13a1702016-03-15 19:33:14 +0100633Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100634This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100635
636=item B<--gzip|-z>
637
Akronf7ad89e2016-03-16 18:22:47 +0100638Compress the output.
639Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100640
Akron11c80302016-03-18 19:44:43 +0100641=item B<--cache|-c>
642
643File to mmap a cache (using L<Cache::FastMmap>).
644Defaults to C<korapxml2krill.cache> in the calling directory.
645
646=item B<--cache-size|-cs>
647
648Size of the cache. Defaults to C<50m>.
649
650=item B<--cache-init|-ci>
651
652Initialize cache file.
653Can be flagged using C<--no-cache-init> as well.
654Defaults to C<true>.
655
656=item B<--cache-delete|-cd>
657
658Delete cache file after processing.
659Can be flagged using C<--no-cache-delete> as well.
660Defaults to C<true>.
661
Akrone10ad322016-02-27 10:54:26 +0100662=item B<--sigle|-sg>
663
664Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100665Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100666I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200667Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100668
Akron941c1a62016-02-23 17:41:41 +0100669=item B<--log|-l>
670
671The L<Log4perl> log level, defaults to C<ERROR>.
672
673=item B<--help|-h>
674
675Print this document.
676
677=item B<--version|-v>
678
679Print version information.
680
681=back
682
Akronc13a1702016-03-15 19:33:14 +0100683=head1 ANNOTATION SUPPORT
684
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
686developed in the KorAP project that are part of the KorAP preprocessing pipeline.
687The base foundry with paragraphs, sentences, and the text element are mandatory for
688L<Krill|https://github.com/KorAP/Krill>.
689
Akronf7ad89e2016-03-16 18:22:47 +0100690=over 2
Akronc13a1702016-03-15 19:33:14 +0100691
692=item B<Base>
693
694=over 4
695
Akronf7ad89e2016-03-16 18:22:47 +0100696=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100697
Akronf7ad89e2016-03-16 18:22:47 +0100698=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100699
700=back
701
702=item B<Connexor>
703
704=over 4
705
Akronf7ad89e2016-03-16 18:22:47 +0100706=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100707
Akronf7ad89e2016-03-16 18:22:47 +0100708=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100709
Akronf7ad89e2016-03-16 18:22:47 +0100710=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100711
Akronf7ad89e2016-03-16 18:22:47 +0100712=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100713
714=back
715
716=item B<CoreNLP>
717
718=over 4
719
Akronf7ad89e2016-03-16 18:22:47 +0100720=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100721
Akronf7ad89e2016-03-16 18:22:47 +0100722=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100723
Akronf7ad89e2016-03-16 18:22:47 +0100724=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100725
Akronf7ad89e2016-03-16 18:22:47 +0100726=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100727
728=back
729
730=item B<DeReKo>
731
732=over 4
733
Akronf7ad89e2016-03-16 18:22:47 +0100734=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100735
736=back
737
738=item B<Glemm>
739
740=over 4
741
Akronf7ad89e2016-03-16 18:22:47 +0100742=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100743
744=back
745
746=item B<Mate>
747
748=over 4
749
Akronf7ad89e2016-03-16 18:22:47 +0100750=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100753
754=back
755
756=item B<OpenNLP>
757
758=over 4
759
Akronf7ad89e2016-03-16 18:22:47 +0100760=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100761
Akronf7ad89e2016-03-16 18:22:47 +0100762=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100763
764=back
765
766=item B<Sgbr>
767
768=over 4
769
Akronf7ad89e2016-03-16 18:22:47 +0100770=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100771
Akronf7ad89e2016-03-16 18:22:47 +0100772=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100773
774=back
775
776=item B<TreeTagger>
777
778=over 4
779
Akronf7ad89e2016-03-16 18:22:47 +0100780=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100783
784=back
785
786=item B<XIP>
787
788=over 4
789
Akronf7ad89e2016-03-16 18:22:47 +0100790=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100791
Akronf7ad89e2016-03-16 18:22:47 +0100792=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100793
Akronf7ad89e2016-03-16 18:22:47 +0100794=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100795
796=back
797
798=back
799
800More importers are in preparation.
801New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
802See the built-in annotation importers as examples.
803
Akron941c1a62016-02-23 17:41:41 +0100804=head1 AVAILABILITY
805
806 https://github.com/KorAP/KorAP-XML-Krill
807
808
809=head1 COPYRIGHT AND LICENSE
810
811Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100812
Akron941c1a62016-02-23 17:41:41 +0100813Author: L<Nils Diewald|http://nils-diewald.de/>
814
815L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
816Corpus Analysis Platform at the
817L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
818member of the
819L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
820
821This program is free software published under the
822L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
823
824=cut