blob: 5e9cc3850158b4e55484b3289eb9b470f8b94721 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010017use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010018# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010019# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010020
Akron941c1a62016-02-23 17:41:41 +010021# CHANGES:
22# ----------------------------------------------------------
23# 2013/11/25
24# - Initial release
25#
26# 2014/10/29
27# - Merges foundry data to create indexer friendly documents
28#
Akron93d620e2016-02-05 19:40:05 +010029# 2016/02/04
30# - renamed to korapxml2krill
31# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010032#
33# 2016/02/12
34# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010035# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010036#
37# 2016/02/14
38# - Added version information
Akron941c1a62016-02-23 17:41:41 +010039# - Added support for archive files
40#
41# 2016/02/15
42# - Fixed temporary directory bug
43# - Improved skipping before unzipping
44# - Added EXPERIMENTAL concurrency support
45#
46# 2016/02/23
47# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010048#
49# 2016/02/27
50# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010051#
52# 2016/03/17
53# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010054#
55# 2016/03/18
56# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020057#
Akronf3f0c942016-06-27 13:27:14 +020058# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020059# - Added multi archive support
60# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020061# - Added Malt#Dependency support
Akron941c1a62016-02-23 17:41:41 +010062# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010063
# Release date shown in the version banner.
# Keep this in sync with the newest entry of the CHANGES list above.
our $LAST_CHANGE = '2016/06/27';

# Directory the script is located in (used to re-invoke it per text)
our $LOCAL = $FindBin::Bin;

# Banner printed by --version and prepended to all usage messages
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
69
70
# Parse command:
# A leading argument that does not start with a dash
# is taken as the name of the sub-command to run
my $cmd;
our @ARGV;
$cmd = shift @ARGV
  if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';
Akron93d620e2016-02-05 19:40:05 +010077
# Options that can be passed multiple times are collected in arrays.
# @anno has to be declared here and passed as \@anno below:
# \(my @anno) would be flattened to a list of references to the
# (not yet existing) elements, i.e. to an empty list, leaving the
# option without any destination.
my (@skip, @sigle, @input, @anno);
my $text;

# Arguments for pod2usage() shared by all usage errors
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line.
# Unknown or malformed options are reported by GetOptions itself
# and lead to the usage message instead of being silently ignored.
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000124
Nils Diewald7364d1f2013-11-05 19:26:35 +0000125
# Set up Log4perl: a single appender writes to STDERR,
# colored by level and filtered by the level given with --log
{
  my $appender = 'log4perl.appender.STDERR';

  Log::Log4perl->init({
    'log4perl.rootLogger' => join(', ', uc($log_level), 'STDERR'),
    $appender             => 'Log::Log4perl::Appender::ScreenColoredLevels',
    "$appender.layout"    => 'PatternLayout',
    "$appender.layout.ConversionPattern" => '[%r] %F %L %c - %m%n'
  });
}

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
135
Akron941c1a62016-02-23 17:41:41 +0100136
# Get file name based on path information.
#
# Expects the path of a text directory and returns a flat name for it:
# a leading temporary directory and the first --input value are removed,
# all slashes are turned into dashes and leading dashes are dropped
# (e.g. "<input>/CORPUS/DOC/TEXT" becomes "CORPUS-DOC-TEXT").
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove the temporary directory texts of archives are extracted to
  # NOTE(review): assumes temporary directories are created below /tmp
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. It is quoted, so characters like "." or "+"
  # are matched literally, and it is only removed as a prefix
  # (the former pattern put a quantifier on the anchor).
  $file =~ s/^\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
147
Akron941c1a62016-02-23 17:41:41 +0100148
# Write file.
#
# Converts a single text by calling this script again in single-file
# mode. Expects the path of the text directory, forwards the relevant
# options of the current call and returns the base name of the output
# file (without directory and extension).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  # The call is built as a list and not as a string, so no shell is
  # involved: values starting with "#" (like "-s #ALL") would otherwise
  # start a shell comment, and paths with whitespace would be split.
  my @call = ('perl', $LOCAL . '/korapxml2krill');
  push @call, '-i', $anno;
  push @call, '-o', $output . '/' . $file . '.json' . ($gzip ? '.gz' : '');
  push @call, '-z' if $gzip;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;
  push @call, '-c', $cache_file;
  push @call, '-cs', $cache_size;
  push @call, '--no-cache-delete'; # Don't delete the cache
  push @call, '--no-cache-init';   # Don't initialize the cache

  # Forward the primary switch the way it was given (if it was given)
  push @call, ($primary ? '--primary' : '--no-primary') if defined $primary;

  # Pretty is a plain flag and takes no value
  push @call, '-y' if $pretty;

  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of ignoring them
  if (system(@call) != 0) {
    $log->warn("Conversion of $anno failed (status $?)");
  };

  return "$file";
};
175
Nils Diewald2db9ad02013-10-29 19:26:43 +0000176
# Convert sigle to path construct:
# "CORPUS_DOC.TEXT" (optionally surrounded by whitespace) becomes
# "CORPUS/DOC/TEXT", everything else is left untouched
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}
179
# Process a single file:
# Parse the document in the first input directory, add all requested
# annotation layers and write the result as JSON to the output file
# (or to STDOUT, if no output is given)
unless ($cmd) {
  my $input = $input[0];

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Case insensitive lookup of everything passed with --skip
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last call and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Name of the document in log messages.
  # The output is undefined when writing to STDOUT.
  my $label = $output // $input;

  # Create and parse new document (the path requires a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5'),
    cache => Cache::FastMmap->new(
      share_file => $cache_file,
      cache_size => $cache_size,
      init_file => $cache_init
    )
  );

  unless ($doc->parse) {
    $log->warn($label . " can't be processed - no document data");
    exit(0);
  };

  # The base tokenization is given as Foundry#Layer
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($label . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotations as [Foundry, Layer],
  # in the order they are added to the document
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the explicitly requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }

  # Add to index file - respect skipping
  else {
    foreach my $info (@layers) {

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{lc($info->[0])};
      next if $skip{lc($info->[0]) . '#' . lc($info->[1])};

      $tokens->add(@$info);
      stop_time;
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file;

    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1);
    }
    else {
      require IO::File;
      $file = IO::File->new($output, "w");
    };

    # Both constructors return undef on failure
    unless ($file) {
      $log->error(
        "Unable to open '$output' for writing: " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
}
Nils Diewald59094f22014-11-05 18:20:50 +0000338
# Extract XML files:
# Unpack the texts given by --sigle from the input archive(s)
# into the output directory
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # The output has to be an existing directory
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # TODO: Support sigles and full archives

  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives.
    # The first input is the primary archive the object was created with.
    # NOTE(review): presumes attach() is only meant for additional
    # archives - confirm against KorAP::XML::Archive
    $archive->attach($_) foreach @input[1..$#input];

    # Iterate over all given sigles and extract
    my $failed = 0;
    foreach (@sigle) {
      print "$_ ";
      if ($archive->extract('./' . $_, $output)) {
        print '';
      }
      else {
        print 'not ';
        $failed++;
      };
      print "extracted.\n";
    };

    print "\n";

    # Only signal an error if a text could not be extracted
    exit($failed ? 1 : 0);
  }
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
376
# Process an archive:
# Convert all texts of a directory or of a (set of) zip archive(s)
# by calling write_file() for each text, optionally in parallel forks
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # The output has to be an existing directory
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message.
  # The callback receives the pid and the exit code first
  # and the data passed to finish() as the last parameter.
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;
      my $data = $_[-1];
      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  unless (Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file => $cache_init
  )) {
    print "Unable to intialize cache '$cache_file'\n\n";
    exit(1);
  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $path;

    # Every directory containing a data.xml is a text to convert
    while (1) {
      if (!$it->is_directory && ($path = $it->get) && $path =~ s{/data\.xml$}{}) {
        push @dirs, $path;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $dir (@dirs) {

      # Skip texts that are already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives.
    # The first input is the primary archive the object was created with.
    # NOTE(review): presumes attach() is only meant for additional
    # archives - confirm against KorAP::XML::Archive
    $archive->attach($_) foreach @input[1..$#input];

    # Temporary directories are created using File::Temp
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $entry (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($entry);

      # Skip texts that are already converted
      unless ($overwrite) {

        # This is not correct!!
        my $filename = catfile(
          $output,
          get_file_name(
            catfile($corpus, $doc, $text)
              . '.json' . ($gzip ? '.gz' : '')
            )
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory (removed when the object is released)
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($entry, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $entry . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing was processed - stop here, as there is no timing to report
  else {
    print "Input is neither a directory nor an archive.\n\n";
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
545
# Unknown command:
# Neither single-file processing nor one of the supported commands
# was requested - report the command and print the usage
else {
  warn sprintf("Unknown command '%s'.\n\n", $cmd);
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000551
552__END__
Akron941c1a62016-02-23 17:41:41 +0100553
554=pod
555
556=encoding utf8
557
558=head1 NAME
559
Akronf7ad89e2016-03-16 18:22:47 +0100560korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100561
562
563=head1 SYNOPSIS
564
  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100568
569
570=head1 DESCRIPTION
571
572L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
573compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100574The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100575
576
577=head1 INSTALLATION
578
579The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
580
581 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
582
Akronc13a1702016-03-15 19:33:14 +0100583In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100584be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100585
586
587=head1 ARGUMENTS
588
589=over 2
590
591=item B<archive>
592
Akrone10ad322016-02-27 10:54:26 +0100593Process an archive as a Zip-file or a folder of KorAP-XML documents.
594
595=item B<extract>
596
597Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100598
599=back
600
601
602=head1 OPTIONS
603
604=over 2
605
Akron2cfe8092016-06-24 17:48:49 +0200606=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100607
Akronf7ad89e2016-03-16 18:22:47 +0100608Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100609
Archiving supports multiple input archives with the constraint
that the first archive listed contains all primary data files
and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200613
  -i file/news.zip -i file/news.malt.zip -i '#file/news.tt.zip'

(The directory structure follows the base directory format,
which may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
Quote such arguments, as most shells treat a leading C<#>
as the start of a comment.)
Akron2cfe8092016-06-24 17:48:49 +0200620
Akron941c1a62016-02-23 17:41:41 +0100621=item B<--output|-o> <directory|file>
622
623Output folder for archive processing or
624document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100625writes to C<STDOUT> by default
626(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100627
628=item B<--overwrite|-w>
629
630Overwrite files that already exist.
631
632=item B<--token|-t> <foundry>[#<file>]
633
634Define the default tokenization by specifying
635the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100636of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100637
638=item B<--skip|-s> <foundry>[#<layer>]
639
Akronf7ad89e2016-03-16 18:22:47 +0100640Skip specific annotations by specifying the foundry
641(and optionally the layer with a C<#>-prefix),
642e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100643Can be set multiple times.
644
Akronc13a1702016-03-15 19:33:14 +0100645=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100646
Akronf7ad89e2016-03-16 18:22:47 +0100647Convert specific annotations by specifying the foundry
648(and optionally the layer with a C<#>-prefix),
649e.g. C<Mate> or C<Mate#Morpho>.
650Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100651
652=item B<--primary|-p>
653
Akronc13a1702016-03-15 19:33:14 +0100654Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100655Can be flagged using C<--no-primary> as well.
656This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100657
658=item B<--jobs|-j>
659
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100661for archive processing.
Akron11c80302016-03-18 19:44:43 +0100662Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100663This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100664
Akron35db6e32016-03-17 22:42:22 +0100665=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100666
Akron35db6e32016-03-17 22:42:22 +0100667Define the metadata parser to use. Defaults to C<I5>.
668Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
669This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100670
671=item B<--pretty|-y>
672
Akronc13a1702016-03-15 19:33:14 +0100673Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100674This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100675
676=item B<--gzip|-z>
677
Akronf7ad89e2016-03-16 18:22:47 +0100678Compress the output.
679Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100680
Akron11c80302016-03-18 19:44:43 +0100681=item B<--cache|-c>
682
683File to mmap a cache (using L<Cache::FastMmap>).
684Defaults to C<korapxml2krill.cache> in the calling directory.
685
686=item B<--cache-size|-cs>
687
688Size of the cache. Defaults to C<50m>.
689
690=item B<--cache-init|-ci>
691
692Initialize cache file.
693Can be flagged using C<--no-cache-init> as well.
694Defaults to C<true>.
695
696=item B<--cache-delete|-cd>
697
698Delete cache file after processing.
699Can be flagged using C<--no-cache-delete> as well.
700Defaults to C<true>.
701
Akrone10ad322016-02-27 10:54:26 +0100702=item B<--sigle|-sg>
703
704Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100705Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100706I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200707Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100708
Akron941c1a62016-02-23 17:41:41 +0100709=item B<--log|-l>
710
711The L<Log4perl> log level, defaults to C<ERROR>.
712
713=item B<--help|-h>
714
715Print this document.
716
717=item B<--version|-v>
718
719Print version information.
720
721=back
722
Akronc13a1702016-03-15 19:33:14 +0100723=head1 ANNOTATION SUPPORT
724
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
726developed in the KorAP project that are part of the KorAP preprocessing pipeline.
727The base foundry with paragraphs, sentences, and the text element are mandatory for
728L<Krill|https://github.com/KorAP/Krill>.
729
Akronf7ad89e2016-03-16 18:22:47 +0100730=over 2
Akronc13a1702016-03-15 19:33:14 +0100731
732=item B<Base>
733
734=over 4
735
Akronf7ad89e2016-03-16 18:22:47 +0100736=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100737
Akronf7ad89e2016-03-16 18:22:47 +0100738=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100739
740=back
741
742=item B<Connexor>
743
744=over 4
745
Akronf7ad89e2016-03-16 18:22:47 +0100746=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100747
Akronf7ad89e2016-03-16 18:22:47 +0100748=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100749
Akronf7ad89e2016-03-16 18:22:47 +0100750=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100753
754=back
755
756=item B<CoreNLP>
757
758=over 4
759
Akronf7ad89e2016-03-16 18:22:47 +0100760=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100761
Akronf7ad89e2016-03-16 18:22:47 +0100762=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100763
Akronf7ad89e2016-03-16 18:22:47 +0100764=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100765
Akronf7ad89e2016-03-16 18:22:47 +0100766=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100767
768=back
769
770=item B<DeReKo>
771
772=over 4
773
Akronf7ad89e2016-03-16 18:22:47 +0100774=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100775
776=back
777
778=item B<Glemm>
779
780=over 4
781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100783
784=back
785
786=item B<Mate>
787
788=over 4
789
Akronf7ad89e2016-03-16 18:22:47 +0100790=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100791
Akronf7ad89e2016-03-16 18:22:47 +0100792=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100793
794=back
795
796=item B<OpenNLP>
797
798=over 4
799
Akronf7ad89e2016-03-16 18:22:47 +0100800=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100801
Akronf7ad89e2016-03-16 18:22:47 +0100802=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100803
804=back
805
806=item B<Sgbr>
807
808=over 4
809
Akronf7ad89e2016-03-16 18:22:47 +0100810=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100811
Akronf7ad89e2016-03-16 18:22:47 +0100812=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100813
814=back
815
816=item B<TreeTagger>
817
818=over 4
819
Akronf7ad89e2016-03-16 18:22:47 +0100820=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100821
Akronf7ad89e2016-03-16 18:22:47 +0100822=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100823
824=back
825
826=item B<XIP>
827
828=over 4
829
Akronf7ad89e2016-03-16 18:22:47 +0100830=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100831
Akronf7ad89e2016-03-16 18:22:47 +0100832=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100833
Akronf7ad89e2016-03-16 18:22:47 +0100834=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100835
836=back
837
838=back
839
840More importers are in preparation.
841New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
842See the built-in annotation importers as examples.
843
Akron941c1a62016-02-23 17:41:41 +0100844=head1 AVAILABILITY
845
846 https://github.com/KorAP/KorAP-XML-Krill
847
848
849=head1 COPYRIGHT AND LICENSE
850
851Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100852
Akron941c1a62016-02-23 17:41:41 +0100853Author: L<Nils Diewald|http://nils-diewald.de/>
854
855L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
856Corpus Analysis Platform at the
857L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
858member of the
859L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
860
861This program is free software published under the
862L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
863
864=cut