blob: af755385368af245d7a316a2c250dcd8d3275b1d [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron405f0c52016-07-07 17:56:16 +020017use KorAP::XML::ProcessFile;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
# Date of the most recent entry in the CHANGES log above
our $LAST_CHANGE = '2016/07/06';

# Directory containing this script
our $LOCAL = $FindBin::Bin;

# One-line version banner (library version, contact, last change),
# terminated by a newline
our $VERSION_MSG = join(
  ' - ',
  'Version ' . $KorAP::XML::Krill::VERSION,
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
73
74
# Parse command: a leading argument that does not start with a dash
# is taken as the sub-command and removed from the argument list
our @ARGV;
my $cmd = ($ARGV[0] && $ARGV[0] !~ /^-/) ? shift(@ARGV) : undef;
Akron93d620e2016-02-05 19:40:05 +010081
# Options that can be given multiple times are collected in arrays.
# They are declared up front and passed as \@array: the former
# \(my @anno) expanded to references to the (zero) elements of @anno,
# so GetOptions got no destination and @anno was never filled.
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),

  # Print the usage sections of the POD below, preceded by the version
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },

  # Print the version banner only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
);
118
# Arguments for pod2usage() on invalid invocations: print the usage
# sections together with the version banner and exit with status 1
my %ERROR_HASH = (
  '-exit'     => 1,
  '-msg'      => $VERSION_MSG,
  '-verbose'  => 99,
  '-sections' => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
);

# At least one input has to be given
if (!@input) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000128
Nils Diewald7364d1f2013-11-05 19:26:35 +0000129
# Configure Log::Log4perl: a single colored appender named STDERR,
# with the root logger set to the upper-cased level given by --log
my $appender = 'log4perl.appender.STDERR';
my %log_conf = (
  'log4perl.rootLogger'                => uc($log_level) . ', STDERR',
  $appender                            => 'Log::Log4perl::Appender::ScreenColoredLevels',
  "$appender.layout"                   => 'PatternLayout',
  "$appender.layout.ConversionPattern" => '[%r] %F %L %c - %m%n',
);
Log::Log4perl->init(\%log_conf);

# Logger used by the rest of the script
my $log = Log::Log4perl->get_logger('main');
139
Akron941c1a62016-02-23 17:41:41 +0100140
# Get file name based on path information.
#
# Takes a path and returns a flat name derived from it: a leading
# "/tmp/<dir>" fragment and the first input path given on the command
# line are stripped from the start, the remaining slashes are turned
# into dashes and leading dashes are removed.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input prefix. The path is quoted, so regex
  # metacharacters in it (e.g. "." or "+") are matched literally,
  # and it is only removed from the start of the path.
  $file =~ s{^/?\Q$base\E}{};

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
151
Akron941c1a62016-02-23 17:41:41 +0100152
# Write file: convert a single text directory by running this script
# again without a sub-command.
#
# Takes the path of the text directory and returns the flat name
# (without extension) of the output file the child was asked to write.
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The command is built as a list and run without a shell, so paths
  # containing spaces and values starting with '#' (e.g. the skip
  # value '#all', which a shell treats as the start of a comment)
  # reach the child process unchanged.
  my @call = (
    'perl', $LOCAL . '/korapxml2krill',
    '-i', $anno,
    '-o', $target,
  );
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # NOTE(review): --no-primary is passed when $primary is true, which
  # looks inverted - confirm against the intended semantics of --primary
  push @call, '--no-primary' if $primary;

  # --pretty is a plain flag and takes no value
  push @call, '-y' if $pretty;
  push @call, map { ('-a', $_) } @anno;
  push @call, map { ('-s', $_) } @skip;

  # Report a child that could not be started or exited non-zero
  if (system(@call) != 0) {
    $log->error("Conversion of '$anno' failed (status $?)");
  };

  return "$file";
};
179
Nils Diewald2db9ad02013-10-29 19:26:43 +0000180
# Rewrite each sigle of the form "Corpus_Document.Text" (optionally
# surrounded by whitespace) in place as the path "Corpus/Document/Text"
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
183
# Process a single file
unless ($cmd) {
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lower-cased lookup table of everything passed with --skip
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Name used in log messages: the output file, or the input
  # in case the result is written to STDOUT ($output is undefined then)
  my $target = $output // $input[0];

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last call and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document (the path gets a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization, given as Foundry#Layer
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # All annotations added by default, as [Foundry, Layer] pairs
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    ['Malt', 'Dependency'],

    # MDParser
    ['MDParser', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency'],
  );

  # Everything is skipped - only add what was requested with --anno
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      # Skip if Foundry or Foundry#Layer should be skipped
      unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
        $tokens->add(@$info);
        stop_time;
      };
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1)
        or $log->error("Unable to write '$output': $GzipError");
    }
    else {
      # Load explicitly instead of relying on another module to do so
      require IO::File;
      $file = IO::File->new($output, "w")
        or $log->error("Unable to write '$output': $!");
    };

    # Don't call methods on a handle that could not be opened
    exit(1) unless $file;

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
}
Nils Diewald59094f22014-11-05 18:20:50 +0000345
# Extract XML files
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # The output directory has to exist - this is an error condition
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # TODO: Support sigles and full archives

  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    # NOTE(review): this also attaches the first (primary) input again -
    # confirm that attach() tolerates that
    $archive->attach($_) foreach @input;

    # Iterate over all given sigles and extract
    my $failed = 0;
    foreach my $sigle (@sigle) {
      my $ok = $archive->extract('./' . $sigle, $output);
      $failed++ unless $ok;
      print "$sigle " . ($ok ? '' : 'not ') . "extracted.\n";
    };

    print "\n";

    # Succeed only if every requested sigle was extracted
    exit($failed ? 1 : 0);
  }
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
382
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # The output directory has to exist - this is an error condition
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first and
      # the data reference passed to finish() as the last argument
      my ($pid, $code) = @_;
      my $data = $_[-1];
      my $msg = ref($data) eq 'SCALAR' ? $$data : '';
      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Create (or open) the cache file shared with the converting processes
  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect every directory that contains a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Skip texts with an existing output file
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;
      my $msg;

      $msg = write_file($dirs[$i]);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Load explicitly instead of relying on another module to do so
    require File::Temp;

    # Add further annotation archived
    # NOTE(review): this also attaches the first (primary) input again -
    # confirm that attach() tolerates that
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      # Skip texts with an existing output file
      unless ($overwrite) {

        # This is not correct!!
        my $filename = catfile(
          $output,
          get_file_name(
            catfile($corpus, $doc, $text)
              . '.json' . ($gzip ? '.gz' : '')
            )
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $dirs[$i] . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing to process: clean up and fail instead of reporting
  # a timing for a run that never started
  else {
    print "Input is neither a directory nor an archive.\n\n";
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
551
# Any other sub-command is rejected with a warning and the usage message
else {
  warn "Unknown command '" . $cmd . "'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000557
558__END__
Akron941c1a62016-02-23 17:41:41 +0100559
560=pod
561
562=encoding utf8
563
564=head1 NAME
565
Akronf7ad89e2016-03-16 18:22:47 +0100566korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100567
568
569=head1 SYNOPSIS
570
Akronc13a1702016-03-15 19:33:14 +0100571 $ korapxml2krill -z --input <directory> --output <filename>
572 $ korapxml2krill archive -z --input <directory> --output <directory>
 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100574
575
576=head1 DESCRIPTION
577
578L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
579compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100580The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100581
582
583=head1 INSTALLATION
584
585The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
586
587 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
588
Akronc13a1702016-03-15 19:33:14 +0100589In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100590be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100591
592
593=head1 ARGUMENTS
594
595=over 2
596
597=item B<archive>
598
Akrone10ad322016-02-27 10:54:26 +0100599Process an archive as a Zip-file or a folder of KorAP-XML documents.
600
601=item B<extract>
602
603Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100604
605=back
606
607
608=head1 OPTIONS
609
610=over 2
611
Akron2cfe8092016-06-24 17:48:49 +0200612=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100613
Akronf7ad89e2016-03-16 18:22:47 +0100614Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100615
Akron0c3e3752016-06-28 15:55:53 +0200616Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200617that the first archive listed contains all primary data files
618and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200619
620 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
621
Akron0c3e3752016-06-28 15:55:53 +0200622(The directory structure follows the base directory format,
623that may include a C<.> root folder.
624In this case further archives lacking a C<.> root folder
625need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200626
Akron941c1a62016-02-23 17:41:41 +0100627=item B<--output|-o> <directory|file>
628
629Output folder for archive processing or
630document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100631writes to C<STDOUT> by default
632(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100633
634=item B<--overwrite|-w>
635
636Overwrite files that already exist.
637
638=item B<--token|-t> <foundry>[#<file>]
639
640Define the default tokenization by specifying
641the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100643
644=item B<--skip|-s> <foundry>[#<layer>]
645
Akronf7ad89e2016-03-16 18:22:47 +0100646Skip specific annotations by specifying the foundry
647(and optionally the layer with a C<#>-prefix),
648e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100649Can be set multiple times.
650
Akronc13a1702016-03-15 19:33:14 +0100651=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100652
Akronf7ad89e2016-03-16 18:22:47 +0100653Convert specific annotations by specifying the foundry
654(and optionally the layer with a C<#>-prefix),
655e.g. C<Mate> or C<Mate#Morpho>.
656Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100657
658=item B<--primary|-p>
659
Akronc13a1702016-03-15 19:33:14 +0100660Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100661Can be flagged using C<--no-primary> as well.
662This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100663
664=item B<--jobs|-j>
665
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100667for archive processing.
Akron11c80302016-03-18 19:44:43 +0100668Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100669This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100670
Akron35db6e32016-03-17 22:42:22 +0100671=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100672
Akron35db6e32016-03-17 22:42:22 +0100673Define the metadata parser to use. Defaults to C<I5>.
674Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
675This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100676
677=item B<--pretty|-y>
678
Akronc13a1702016-03-15 19:33:14 +0100679Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100680This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100681
682=item B<--gzip|-z>
683
Akronf7ad89e2016-03-16 18:22:47 +0100684Compress the output.
685Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100686
Akron11c80302016-03-18 19:44:43 +0100687=item B<--cache|-c>
688
689File to mmap a cache (using L<Cache::FastMmap>).
690Defaults to C<korapxml2krill.cache> in the calling directory.
691
692=item B<--cache-size|-cs>
693
694Size of the cache. Defaults to C<50m>.
695
696=item B<--cache-init|-ci>
697
698Initialize cache file.
699Can be flagged using C<--no-cache-init> as well.
700Defaults to C<true>.
701
702=item B<--cache-delete|-cd>
703
704Delete cache file after processing.
705Can be flagged using C<--no-cache-delete> as well.
706Defaults to C<true>.
707
Akrone10ad322016-02-27 10:54:26 +0100708=item B<--sigle|-sg>
709
710Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100711Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100712I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200713Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100714
Akron941c1a62016-02-23 17:41:41 +0100715=item B<--log|-l>
716
717The L<Log4perl> log level, defaults to C<ERROR>.
718
719=item B<--help|-h>
720
721Print this document.
722
723=item B<--version|-v>
724
725Print version information.
726
727=back
728
Akronc13a1702016-03-15 19:33:14 +0100729=head1 ANNOTATION SUPPORT
730
731L<KorAP::XML::Krill> has built-in importer for some annotation foundries and layers
732developed in the KorAP project that are part of the KorAP preprocessing pipeline.
733The base foundry with paragraphs, sentences, and the text element are mandatory for
734L<Krill|https://github.com/KorAP/Krill>.
735
Akronf7ad89e2016-03-16 18:22:47 +0100736=over 2
Akronc13a1702016-03-15 19:33:14 +0100737
738=item B<Base>
739
740=over 4
741
Akronf7ad89e2016-03-16 18:22:47 +0100742=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100743
Akronf7ad89e2016-03-16 18:22:47 +0100744=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100745
746=back
747
748=item B<Connexor>
749
750=over 4
751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100753
Akronf7ad89e2016-03-16 18:22:47 +0100754=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100755
Akronf7ad89e2016-03-16 18:22:47 +0100756=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100757
Akronf7ad89e2016-03-16 18:22:47 +0100758=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100759
760=back
761
762=item B<CoreNLP>
763
764=over 4
765
Akronf7ad89e2016-03-16 18:22:47 +0100766=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100767
Akronf7ad89e2016-03-16 18:22:47 +0100768=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100769
Akronf7ad89e2016-03-16 18:22:47 +0100770=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100771
Akronf7ad89e2016-03-16 18:22:47 +0100772=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100773
774=back
775
776=item B<DeReKo>
777
778=over 4
779
Akronf7ad89e2016-03-16 18:22:47 +0100780=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100781
782=back
783
784=item B<Glemm>
785
786=over 4
787
Akronf7ad89e2016-03-16 18:22:47 +0100788=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100789
=back

=item B<Malt>

=over 4

=item #Dependency

=back

=item B<MDParser>

=over 4

=item #Dependency

=back

=item B<Mate>
793
794=over 4
795
Akronf7ad89e2016-03-16 18:22:47 +0100796=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100797
Akronf7ad89e2016-03-16 18:22:47 +0100798=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100799
800=back
801
802=item B<OpenNLP>
803
804=over 4
805
Akronf7ad89e2016-03-16 18:22:47 +0100806=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100807
Akronf7ad89e2016-03-16 18:22:47 +0100808=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100809
810=back
811
812=item B<Sgbr>
813
814=over 4
815
Akronf7ad89e2016-03-16 18:22:47 +0100816=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100817
Akronf7ad89e2016-03-16 18:22:47 +0100818=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100819
820=back
821
822=item B<TreeTagger>
823
824=over 4
825
Akronf7ad89e2016-03-16 18:22:47 +0100826=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100827
Akronf7ad89e2016-03-16 18:22:47 +0100828=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100829
830=back
831
832=item B<XIP>
833
834=over 4
835
Akronf7ad89e2016-03-16 18:22:47 +0100836=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100837
Akronf7ad89e2016-03-16 18:22:47 +0100838=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100839
Akronf7ad89e2016-03-16 18:22:47 +0100840=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100841
842=back
843
844=back
845
846More importers are in preparation.
847New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
848See the built-in annotation importers as examples.
849
Akron941c1a62016-02-23 17:41:41 +0100850=head1 AVAILABILITY
851
852 https://github.com/KorAP/KorAP-XML-Krill
853
854
855=head1 COPYRIGHT AND LICENSE
856
857Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100858
Akron941c1a62016-02-23 17:41:41 +0100859Author: L<Nils Diewald|http://nils-diewald.de/>
860
861L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
862Corpus Analysis Platform at the
863L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
864member of the
865L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
866
867This program is free software published under the
868L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
869
870=cut