blob: 8a56858b2f1f8bae1785ace9129b5218c47104b0 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010019# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010020
Akron941c1a62016-02-23 17:41:41 +010021# CHANGES:
22# ----------------------------------------------------------
23# 2013/11/25
24# - Initial release
25#
26# 2014/10/29
27# - Merges foundry data to create indexer friendly documents
28#
Akron93d620e2016-02-05 19:40:05 +010029# 2016/02/04
30# - renamed to korapxml2krill
31# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010032#
33# 2016/02/12
34# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010035# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010036#
37# 2016/02/14
38# - Added version information
Akron941c1a62016-02-23 17:41:41 +010039# - Added support for archive files
40#
41# 2016/02/15
42# - Fixed temporary directory bug
43# - Improved skipping before unzipping
44# - Added EXPERIMENTAL concurrency support
45#
46# 2016/02/23
47# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010048#
49# 2016/02/27
50# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010051#
52# 2016/03/17
53# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010054#
55# 2016/03/18
56# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020057#
Akronf3f0c942016-06-27 13:27:14 +020058# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020059# - Added multi archive support
60# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020061# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020062#
63# 2016/07/06
64# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010065# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010066
# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2016/07/06';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Banner printed by --help and --version:
# library version, contact address and date of the last change
our $VERSION_MSG = join(
  ' - ',
  'Version ' . $KorAP::XML::Krill::VERSION,
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
72
73
# Parse command: a leading argument that does not start
# with a dash is not an option but names the command to run
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +010080
# Options that can be passed multiple times are collected in arrays
my (@skip, @sigle, @input);

# Arguments for pod2usage on invalid invocations: print the version
# banner and the usage sections, then exit with status 1.
# Defined ahead of the option parsing, so that a parsing failure
# can be reported the same way.
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line.
# GetOptions returns false on unknown or malformed options (after
# printing a diagnostic) - stop with the usage in that case instead
# of continuing with a half-parsed configuration.
GetOptions(
  'input|i=s'   => \@input,
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \(my @anno),
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000127
Nils Diewald7364d1f2013-11-05 19:26:35 +0000128
# Initialize log4perl object:
# everything at or above the requested level is written
# to a level-colored appender named STDERR
{
  my %log_config = (
    'log4perl.rootLogger' => uc($log_level) . ', STDERR',
    'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout' => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
  );
  Log::Log4perl->init(\%log_config);
}

# Logger used by the whole script
my $log = Log::Log4perl->get_logger('main');
138
Akron941c1a62016-02-23 17:41:41 +0100139
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it:
# a leading temporary directory ("/tmp/<name>") and the first input
# path are removed from the beginning, all remaining slashes are
# turned into dashes and leading dashes are dropped.
#
# The prototype is kept, so existing calls without parentheses
# are parsed exactly as before.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove the temporary directory used for archive extraction
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path from the beginning. The path is quoted,
  # so characters like '.' or '+' are matched literally instead of
  # being interpreted as regular expression syntax.
  $file =~ s/^\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
150
Akron941c1a62016-02-23 17:41:41 +0100151
# Write file
#
# Converts the text in the given directory by running this script
# in single file mode as a child process, forwarding the options of
# the current invocation. The output is written to the output
# directory using the flat file name of the text.
# Returns the flat file name (without extension).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  # The command is passed to system() as a list, so no shell is involved:
  # paths containing spaces stay intact and values starting with a
  # hash sign (like '-s #all') are not cut off as shell comments.
  my @call = ($^X, $LOCAL . '/korapxml2krill');
  push @call, '-i', $anno;
  push @call, '-o', $output . '/' . $file . '.json' . ($gzip ? '.gz' : '');
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Only forward an explicit --no-primary,
  # otherwise the child falls back to its default
  push @call, '--no-primary' if defined $primary && !$primary;

  # Pretty printing is a flag and takes no value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error("Conversion of '$anno' failed (status $?)");
  };

  return "$file";
};
178
Nils Diewald2db9ad02013-10-29 19:26:43 +0000179
# Convert sigle to path construct: surrounding whitespace is removed,
# the first underscore and the following dot are replaced by slashes
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}
182
# Process a single file:
# parse the document in the first input directory, add the base
# tokenization and all (not skipped) annotation layers and
# print the result as JSON to the output file or to STDOUT
unless ($cmd) {
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Foundries and layers to skip are compared in lower case
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document (the path requires a trailing slash)
  $input =~ s{([^/])$}{$1/};

  # Name of the document in log messages -
  # there is no output file when printing to STDOUT
  my $name = $output // $input;

  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($name . " can't be processed - no document data");
    exit(0);
  };

  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;

    # The layer is optional - keep the default if only a foundry is given
    $token_base_layer //= 'Tokens';
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($name . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotations as [foundry, layer] in processing order
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    ['Malt', 'Dependency'],

    # MDParser
    ['MDParser', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the explicitly requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1)
        or $log->error("Unable to write to '$output': $GzipError");
    }
    else {
      # IO::File is not loaded at the top of the file
      require IO::File;
      $file = IO::File->new($output, "w")
        or $log->error("Unable to write to '$output': $!");
    };

    # Stop with an error status if the output can't be opened
    exit(1) unless $file;

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000344
# Extract XML files:
# extract the texts of all given sigles from the input archive(s)
# to the output directory. Exits with 0 if everything was extracted
# and with 1 on any failure.
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # The output has to be an existing directory
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # TODO: Support sigles and full archives

  # The first input has to be a readable archive file
  my $archive;
  unless (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archived
  # NOTE(review): @input still includes the primary archive already passed
  # to the constructor - presumably only the remaining archives need to be
  # attached; confirm against KorAP::XML::Archive before changing.
  $archive->attach($_) foreach @input;

  # Iterate over all given sigles and extract
  my $failed = 0;
  foreach (@sigle) {
    print "$_ ";
    unless ($archive->extract('./' . $_, $output)) {
      print 'not ';
      $failed++;
    };
    print "extracted.\n";
  };

  print "\n";

  # Success is signaled by a zero exit status
  exit($failed ? 1 : 0);
}
381
# Process an archive:
# convert all texts of an input directory or of the input archive(s)
# to single files in the output directory, optionally in parallel forks
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # The output has to be an existing directory
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first
      # and the reference to the message of the job last
      my ($pid, $code) = @_;
      my $data = pop;

      # There is no message if the job ended without passing one
      my $msg = ref $data ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Create the cache file once - it is reused by all conversion processes
  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Don't convert texts that were already written
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    # NOTE(review): @input still includes the primary archive already passed
    # to the constructor - presumably only the remaining archives need to be
    # attached; confirm against KorAP::XML::Archive before changing.
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    # File::Temp is not loaded at the top of the file
    require File::Temp;

    ARCHIVE_LOOP:
    foreach my $path (@dirs) {

      # Split path information (the prefix is not needed)
      my (undef, $corpus, $doc, $text) = $archive->split_path($path);

      unless ($overwrite) {

        # This is not correct!!
        my $filename = catfile(
          $output,
          get_file_name(
            catfile($corpus, $doc, $text)
              . '.json' . ($gzip ? '.gz' : '')
            )
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory to extract the text to
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before the job is finished
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing to process - clean up and stop with an error status
  else {
    print "Input is neither a directory nor an archive.\n\n";
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
550
# Unknown command
else {
  # Name the unsupported command in a warning, then print the usage
  warn sprintf("Unknown command '%s'.\n\n", $cmd);
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000556
557__END__
Akron941c1a62016-02-23 17:41:41 +0100558
559=pod
560
561=encoding utf8
562
563=head1 NAME
564
Akronf7ad89e2016-03-16 18:22:47 +0100565korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100566
567
568=head1 SYNOPSIS
569
  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100573
574
575=head1 DESCRIPTION
576
577L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
578compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100579The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100580
581
582=head1 INSTALLATION
583
584The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
585
586 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
587
Akronc13a1702016-03-15 19:33:14 +0100588In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100589be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100590
591
592=head1 ARGUMENTS
593
594=over 2
595
596=item B<archive>
597
Akrone10ad322016-02-27 10:54:26 +0100598Process an archive as a Zip-file or a folder of KorAP-XML documents.
599
600=item B<extract>
601
602Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100603
604=back
605
606
607=head1 OPTIONS
608
609=over 2
610
Akron2cfe8092016-06-24 17:48:49 +0200611=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100612
Akronf7ad89e2016-03-16 18:22:47 +0100613Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100614
Akron0c3e3752016-06-28 15:55:53 +0200615Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200616that the first archive listed contains all primary data files
617and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200618
619 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
620
Akron0c3e3752016-06-28 15:55:53 +0200621(The directory structure follows the base directory format,
622that may include a C<.> root folder.
623In this case further archives lacking a C<.> root folder
624need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200625
Akron941c1a62016-02-23 17:41:41 +0100626=item B<--output|-o> <directory|file>
627
628Output folder for archive processing or
629document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100630writes to C<STDOUT> by default
631(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100632
633=item B<--overwrite|-w>
634
635Overwrite files that already exist.
636
637=item B<--token|-t> <foundry>[#<file>]
638
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100642
643=item B<--skip|-s> <foundry>[#<layer>]
644
Akronf7ad89e2016-03-16 18:22:47 +0100645Skip specific annotations by specifying the foundry
646(and optionally the layer with a C<#>-prefix),
647e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100648Can be set multiple times.
649
Akronc13a1702016-03-15 19:33:14 +0100650=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100651
Akronf7ad89e2016-03-16 18:22:47 +0100652Convert specific annotations by specifying the foundry
653(and optionally the layer with a C<#>-prefix),
654e.g. C<Mate> or C<Mate#Morpho>.
655Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100656
657=item B<--primary|-p>
658
Akronc13a1702016-03-15 19:33:14 +0100659Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100660Can be flagged using C<--no-primary> as well.
661This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100662
663=item B<--jobs|-j>
664
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100669
Akron35db6e32016-03-17 22:42:22 +0100670=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100671
Akron35db6e32016-03-17 22:42:22 +0100672Define the metadata parser to use. Defaults to C<I5>.
673Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
674This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100675
676=item B<--pretty|-y>
677
Akronc13a1702016-03-15 19:33:14 +0100678Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100679This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100680
681=item B<--gzip|-z>
682
Akronf7ad89e2016-03-16 18:22:47 +0100683Compress the output.
684Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100685
Akron11c80302016-03-18 19:44:43 +0100686=item B<--cache|-c>
687
688File to mmap a cache (using L<Cache::FastMmap>).
689Defaults to C<korapxml2krill.cache> in the calling directory.
690
691=item B<--cache-size|-cs>
692
693Size of the cache. Defaults to C<50m>.
694
695=item B<--cache-init|-ci>
696
697Initialize cache file.
698Can be flagged using C<--no-cache-init> as well.
699Defaults to C<true>.
700
701=item B<--cache-delete|-cd>
702
703Delete cache file after processing.
704Can be flagged using C<--no-cache-delete> as well.
705Defaults to C<true>.
706
Akrone10ad322016-02-27 10:54:26 +0100707=item B<--sigle|-sg>
708
709Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100710Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100711I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200712Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100713
Akron941c1a62016-02-23 17:41:41 +0100714=item B<--log|-l>
715
716The L<Log4perl> log level, defaults to C<ERROR>.
717
718=item B<--help|-h>
719
720Print this document.
721
722=item B<--version|-v>
723
724Print version information.
725
726=back
727
Akronc13a1702016-03-15 19:33:14 +0100728=head1 ANNOTATION SUPPORT
729
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
734
Akronf7ad89e2016-03-16 18:22:47 +0100735=over 2
Akronc13a1702016-03-15 19:33:14 +0100736
737=item B<Base>
738
739=over 4
740
Akronf7ad89e2016-03-16 18:22:47 +0100741=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100742
Akronf7ad89e2016-03-16 18:22:47 +0100743=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100744
745=back
746
747=item B<Connexor>
748
749=over 4
750
Akronf7ad89e2016-03-16 18:22:47 +0100751=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100752
Akronf7ad89e2016-03-16 18:22:47 +0100753=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100754
Akronf7ad89e2016-03-16 18:22:47 +0100755=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100756
Akronf7ad89e2016-03-16 18:22:47 +0100757=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100758
759=back
760
761=item B<CoreNLP>
762
763=over 4
764
Akronf7ad89e2016-03-16 18:22:47 +0100765=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100766
Akronf7ad89e2016-03-16 18:22:47 +0100767=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100768
Akronf7ad89e2016-03-16 18:22:47 +0100769=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100770
Akronf7ad89e2016-03-16 18:22:47 +0100771=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100772
773=back
774
775=item B<DeReKo>
776
777=over 4
778
Akronf7ad89e2016-03-16 18:22:47 +0100779=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100780
781=back
782
783=item B<Glemm>
784
785=over 4
786
Akronf7ad89e2016-03-16 18:22:47 +0100787=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100788
789=back
790
=item B<Malt>

=over 4

=item #Dependency

=back

=item B<MDParser>

=over 4

=item #Dependency

=back

=item B<Mate>
792
793=over 4
794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100796
Akronf7ad89e2016-03-16 18:22:47 +0100797=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100798
799=back
800
801=item B<OpenNLP>
802
803=over 4
804
Akronf7ad89e2016-03-16 18:22:47 +0100805=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100806
Akronf7ad89e2016-03-16 18:22:47 +0100807=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100808
809=back
810
811=item B<Sgbr>
812
813=over 4
814
Akronf7ad89e2016-03-16 18:22:47 +0100815=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100818
819=back
820
821=item B<TreeTagger>
822
823=over 4
824
Akronf7ad89e2016-03-16 18:22:47 +0100825=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100826
Akronf7ad89e2016-03-16 18:22:47 +0100827=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100828
829=back
830
831=item B<XIP>
832
833=over 4
834
=item #Constituency

=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100836
Akronf7ad89e2016-03-16 18:22:47 +0100837=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100838
Akronf7ad89e2016-03-16 18:22:47 +0100839=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100840
841=back
842
843=back
844
845More importers are in preparation.
846New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
847See the built-in annotation importers as examples.
848
Akron941c1a62016-02-23 17:41:41 +0100849=head1 AVAILABILITY
850
851 https://github.com/KorAP/KorAP-XML-Krill
852
853
854=head1 COPYRIGHT AND LICENSE
855
856Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100857
Akron941c1a62016-02-23 17:41:41 +0100858Author: L<Nils Diewald|http://nils-diewald.de/>
859
860L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
861Corpus Analysis Platform at the
862L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
863member of the
864L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
865
866This program is free software published under the
867L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
868
869=cut