blob: 250c68dd993262d759532db5b35b7c4ac8eca516 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2016/08/16';

# Directory of this script as reported by FindBin
our $LOCAL = $FindBin::Bin;

# Version banner printed by --help, --version and on usage errors
our $VERSION_MSG = join(
  '',
  'Version ', $KorAP::XML::Krill::VERSION,
  ' - diewald@ids-mannheim.de - ', $LAST_CHANGE,
  "\n"
);

# Parse command:
# A first argument that does not start with a dash is taken
# as the command and removed from the argument list
my $cmd;
our @ARGV;
my $first_arg = $ARGV[0];
$cmd = shift @ARGV if $first_arg && substr($first_arg, 0, 1) ne '-';
Akron93d620e2016-02-05 19:40:05 +010080
# Options that may be passed multiple times are collected in arrays
my (@skip, @sigle, @anno, @input);

# Parameters passed to pod2usage() on erroneous invocations:
# The version message and the listed POD sections are printed
# and the program exits with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Parse options from the command line.
# GetOptions() returns false if an unknown option or an invalid value
# was passed - in that case the usage is printed instead of silently
# continuing with a partially parsed configuration.
GetOptions(
  'input|i=s'   => \@input,
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),

  # Print the full usage information
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
      -output => '-'
    );
  },

  # Print the version banner only
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG,
      -output => '-'
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000133
# Initialize log4perl object:
# The root logger uses the level passed by --log (upper-cased)
# and a single appender named STDERR with a pattern layout
{
  my %log_config = (
    'log4perl.rootLogger' => uc($log_level) . ', STDERR',
    'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout' => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
  );
  Log::Log4perl->init(\%log_config);
};

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
143
# Lookup table of all skip specifications (case insensitive)
my %skip = map { ( lc($_) => 1 ) } @skip;

# All supported annotations as [Foundry, Layer] pairs.
# Each entry of the table lists a foundry followed by its layers;
# the order of the resulting pairs follows the order of the table.
my @layers = map {
  my ($foundry, @layer_names) = @$_;
  map { [$foundry, $_] } @layer_names;
} (
  [Base       => qw/Sentences Paragraphs/],
  [Connexor   => qw/Morpho Syntax Phrase Sentences/],
  [CoreNLP    => qw/NamedEntities Sentences Morpho Constituency/],
  [DeReKo     => qw/Structure/],
  [Glemm      => qw/Morpho/],
  [Malt       => qw/Dependency/],
  [MDParser   => qw/Dependency/],
  [Mate       => qw/Morpho Dependency/],
  [OpenNLP    => qw/Morpho Sentences/],

  # Schreibgebrauch
  [Sgbr       => qw/Lemma Morpho/],
  [TreeTagger => qw/Morpho Sentences/],
  [XIP        => qw/Morpho Constituency Sentences Dependency/]
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations requested
# explicitly using --anno are taken into account
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped, i.e. neither
# the Foundry nor the Foundry#Layer combination is listed
else {
  @filtered_anno = grep {
    my ($foundry, $layer) = map { lc } @$_;
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
215
# Get tokenization basis (<foundry>[#<layer>]).
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." (a declaration with a statement modifier) is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache object based on the file, size and initialization flag
# passed by the --cache* options
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
);

# Converter object used for all documents - configured
# with the parsed command line options
my $batch_file = KorAP::XML::Batch::File->new(
  cache => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  gzip => $gzip,
  log => $log,
  primary => $primary,
  pretty => $pretty,
  anno => \@filtered_anno
);
238
Akron941c1a62016-02-23 17:41:41 +0100239
# Get file name based on path information.
#
# Expects a path and returns a flat name: A leading temporary
# directory fragment ("/tmp/<name>") and the first occurrence of the
# primary input path are removed, all slashes are replaced by dashes
# and leading dashes are stripped.
sub get_file_name ($) {
  my $file = shift;
  my $i = $input[0];

  # Remove temporary directory fragment
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path (optionally preceded by a slash).
  # The path is quoted, so characters like '.' or '+' are
  # matched literally instead of being treated as a pattern.
  $file =~ s/\/?\Q$i\E//;

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
250
Akron941c1a62016-02-23 17:41:41 +0100251
252# Write file
Akrone1dbc382016-07-08 22:24:52 +0200253#sub write_file {
254# my $anno = shift;
255# my $file = get_file_name $anno;
256#
257# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
258#
259# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
260# $call .= ' -i ' . $anno;
261# $call .= ' -o ' . $output . '/' . $file . '.json';
262# $call .= '.gz -z' if $gzip;
263# $call .= ' -m ' . $meta if $meta;
264# $call .= ' -w' if $overwrite;
265# $call .= ' -t ' . $token_base if $token_base;
266# $call .= ' -l ' . $log_level if $log_level;
267# $call .= ' -c ' . $cache_file;
268# $call .= ' -cs ' . $cache_size;
269# $call .= ' --no-cache-delete'; # Don't delete the cache
270# $call .= ' --no-cache-init'; # Don't initialize the cache
271# $call .= ' --no-primary ' if $primary;
272# $call .= ' -y ' . $pretty if $pretty;
273# $call .= ' -a ' . $_ foreach @anno;
274# $call .= ' -s ' . $_ foreach @skip;
275# system($call);
276# return "$file";
277#};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000278
Nils Diewald2db9ad02013-10-29 19:26:43 +0000279
# Convert sigle to path construct:
# "<corpus>_<document>.<text>" (with optional surrounding whitespace)
# becomes "<corpus>/<document>/<text>" - other values are not changed
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
282
# Process a single file
if (!$cmd) {

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call
  # and since the start of the program (info level)
  sub stop_time {
    my $now = Benchmark->new;
    my $lap = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info("The code took: $lap (overall: $overall)");
    $main::LAST_STOP = $now;
  };

  # The first input is the document directory -
  # make sure the path ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000312
# Extract XML files:
# Extracts all texts given by --sigle from the input archive(s)
# to the output directory. Exits with status 0 on success and
# with status 1 if the preconditions are not met.
elsif ($cmd eq 'extract') {

  # The output directory has to exist
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # TODO: Support sigles and full archives

  # The first input is treated as the primary archive
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {

      # Print the sigle first, as extraction may take a while
      print "$sigle ";
      print '' . ($archive->extract('./' . $sigle, $output) ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";
    exit(0);
  }

  # The primary input is not a file or can't be opened as an archive
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
350
# Process an archive:
# Converts all texts found in an input directory or in the input
# archive(s) and writes one file per text to the output directory.
# The work is distributed to --jobs forked processes.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # The output directory has to exist
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter = 1;  # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {

      # The callback receives the pid and the exit code as the
      # first parameters and the data reference passed to
      # finish() as the last parameter
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # The message may be missing, e.g. if the job returned nothing
      my $msg = (ref $data && defined $$data) ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;          # Start time of the processing
  my $failed = 0; # Set if the input can't be processed at all
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = $batch_file->process($text_dir => $filename);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Temporary directories are created using File::Temp -
    # make sure the module is loaded
    require File::Temp;

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory to extract to
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Temporary directory of the extracted text
        my $dir = catdir("$temp", $corpus, $doc, $text);

        # Write file - like in directory processing, the target
        # is the file name of the text instead of the output directory
        $msg = $batch_file->process($dir => $filename);

        # Release the temporary directory
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        # Release the temporary directory
        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
    $failed = 1;
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  # Nothing was processed, so there is no timing to report
  exit(1) if $failed;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000514
515__END__
Akron941c1a62016-02-23 17:41:41 +0100516
517=pod
518
519=encoding utf8
520
521=head1 NAME
522
Akronf7ad89e2016-03-16 18:22:47 +0100523korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100524
525
526=head1 SYNOPSIS
527
Akronc13a1702016-03-15 19:33:14 +0100528 $ korapxml2krill -z --input <directory> --output <filename>
529 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100531
532
533=head1 DESCRIPTION
534
535L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
536compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100537The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100538
539
540=head1 INSTALLATION
541
542The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
543
544 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
545
Akronc13a1702016-03-15 19:33:14 +0100546In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100547be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100548
549
550=head1 ARGUMENTS
551
552=over 2
553
554=item B<archive>
555
Akrone10ad322016-02-27 10:54:26 +0100556Process an archive as a Zip-file or a folder of KorAP-XML documents.
557
558=item B<extract>
559
560Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100561
562=back
563
564
565=head1 OPTIONS
566
567=over 2
568
Akron2cfe8092016-06-24 17:48:49 +0200569=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100570
Akronf7ad89e2016-03-16 18:22:47 +0100571Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100572
Archive processing supports multiple input archives, with the constraint
that the first archive listed contains all primary data files
and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200576
577 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
578
Akron0c3e3752016-06-28 15:55:53 +0200579(The directory structure follows the base directory format,
580that may include a C<.> root folder.
581In this case further archives lacking a C<.> root folder
582need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200583
Akron941c1a62016-02-23 17:41:41 +0100584=item B<--output|-o> <directory|file>
585
586Output folder for archive processing or
587document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100588writes to C<STDOUT> by default
589(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100590
591=item B<--overwrite|-w>
592
593Overwrite files that already exist.
594
595=item B<--token|-t> <foundry>[#<file>]
596
597Define the default tokenization by specifying
598the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100599of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100600
601=item B<--skip|-s> <foundry>[#<layer>]
602
Akronf7ad89e2016-03-16 18:22:47 +0100603Skip specific annotations by specifying the foundry
604(and optionally the layer with a C<#>-prefix),
605e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100606Can be set multiple times.
607
Akronc13a1702016-03-15 19:33:14 +0100608=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100609
Akronf7ad89e2016-03-16 18:22:47 +0100610Convert specific annotations by specifying the foundry
611(and optionally the layer with a C<#>-prefix),
612e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
Only taken into account in combination with C<--skip #ALL>.
Akron941c1a62016-02-23 17:41:41 +0100614
615=item B<--primary|-p>
616
Akronc13a1702016-03-15 19:33:14 +0100617Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100618Can be flagged using C<--no-primary> as well.
619This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100620
621=item B<--jobs|-j>
622
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100624for archive processing.
Akron11c80302016-03-18 19:44:43 +0100625Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100626This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100627
Akron35db6e32016-03-17 22:42:22 +0100628=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100629
Akron35db6e32016-03-17 22:42:22 +0100630Define the metadata parser to use. Defaults to C<I5>.
631Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
632This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100633
634=item B<--pretty|-y>
635
Akronc13a1702016-03-15 19:33:14 +0100636Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100637This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100638
639=item B<--gzip|-z>
640
Akronf7ad89e2016-03-16 18:22:47 +0100641Compress the output.
642Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100643
Akron11c80302016-03-18 19:44:43 +0100644=item B<--cache|-c>
645
646File to mmap a cache (using L<Cache::FastMmap>).
647Defaults to C<korapxml2krill.cache> in the calling directory.
648
649=item B<--cache-size|-cs>
650
651Size of the cache. Defaults to C<50m>.
652
653=item B<--cache-init|-ci>
654
655Initialize cache file.
656Can be flagged using C<--no-cache-init> as well.
657Defaults to C<true>.
658
659=item B<--cache-delete|-cd>
660
661Delete cache file after processing.
662Can be flagged using C<--no-cache-delete> as well.
663Defaults to C<true>.
664
Akrone10ad322016-02-27 10:54:26 +0100665=item B<--sigle|-sg>
666
667Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100668Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100669I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200670Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100671
Akron941c1a62016-02-23 17:41:41 +0100672=item B<--log|-l>
673
674The L<Log4perl> log level, defaults to C<ERROR>.
675
676=item B<--help|-h>
677
678Print this document.
679
680=item B<--version|-v>
681
682Print version information.
683
684=back
685
Akronc13a1702016-03-15 19:33:14 +0100686=head1 ANNOTATION SUPPORT
687
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
689developed in the KorAP project that are part of the KorAP preprocessing pipeline.
690The base foundry with paragraphs, sentences, and the text element are mandatory for
691L<Krill|https://github.com/KorAP/Krill>.
692
Akronf7ad89e2016-03-16 18:22:47 +0100693=over 2
Akronc13a1702016-03-15 19:33:14 +0100694
695=item B<Base>
696
697=over 4
698
Akronf7ad89e2016-03-16 18:22:47 +0100699=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100700
Akronf7ad89e2016-03-16 18:22:47 +0100701=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100702
703=back
704
705=item B<Connexor>
706
707=over 4
708
Akronf7ad89e2016-03-16 18:22:47 +0100709=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100710
Akronf7ad89e2016-03-16 18:22:47 +0100711=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100712
Akronf7ad89e2016-03-16 18:22:47 +0100713=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100714
Akronf7ad89e2016-03-16 18:22:47 +0100715=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100716
717=back
718
719=item B<CoreNLP>
720
721=over 4
722
Akronf7ad89e2016-03-16 18:22:47 +0100723=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100724
Akronf7ad89e2016-03-16 18:22:47 +0100725=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100726
Akronf7ad89e2016-03-16 18:22:47 +0100727=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100728
Akronf7ad89e2016-03-16 18:22:47 +0100729=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100730
731=back
732
733=item B<DeReKo>
734
735=over 4
736
Akronf7ad89e2016-03-16 18:22:47 +0100737=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100738
739=back
740
741=item B<Glemm>
742
743=over 4
744
Akronf7ad89e2016-03-16 18:22:47 +0100745=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100746
747=back
748
749=item B<Mate>
750
751=over 4
752
Akronf7ad89e2016-03-16 18:22:47 +0100753=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100754
Akronf7ad89e2016-03-16 18:22:47 +0100755=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100756
757=back
758
759=item B<OpenNLP>
760
761=over 4
762
Akronf7ad89e2016-03-16 18:22:47 +0100763=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100764
Akronf7ad89e2016-03-16 18:22:47 +0100765=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100766
767=back
768
769=item B<Sgbr>
770
771=over 4
772
Akronf7ad89e2016-03-16 18:22:47 +0100773=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100774
Akronf7ad89e2016-03-16 18:22:47 +0100775=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100776
777=back
778
779=item B<TreeTagger>
780
781=over 4
782
Akronf7ad89e2016-03-16 18:22:47 +0100783=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100784
Akronf7ad89e2016-03-16 18:22:47 +0100785=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100786
787=back
788
789=item B<XIP>
790
791=over 4
792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100796
Akronf7ad89e2016-03-16 18:22:47 +0100797=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100798
799=back
800
801=back
802
803More importers are in preparation.
804New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
805See the built-in annotation importers as examples.
806
Akron941c1a62016-02-23 17:41:41 +0100807=head1 AVAILABILITY
808
809 https://github.com/KorAP/KorAP-XML-Krill
810
811
812=head1 COPYRIGHT AND LICENSE
813
814Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100815
Akron941c1a62016-02-23 17:41:41 +0100816Author: L<Nils Diewald|http://nils-diewald.de/>
817
818L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
819Corpus Analysis Platform at the
820L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
821member of the
822L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
823
824This program is free software published under the
825L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
826
827=cut