blob: c5db74283c110833b0f167403fc7477705b6829f [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
12use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010013use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010014use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010016use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010017# TODO: use Parallel::Loops
Akron93d620e2016-02-05 19:40:05 +010018
Akron941c1a62016-02-23 17:41:41 +010019# CHANGES:
20# ----------------------------------------------------------
21# 2013/11/25
22# - Initial release
23#
24# 2014/10/29
25# - Merges foundry data to create indexer friendly documents
26#
Akron93d620e2016-02-05 19:40:05 +010027# 2016/02/04
28# - renamed to korapxml2krill
29# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010030#
31# 2016/02/12
32# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010033# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010034#
35# 2016/02/14
36# - Added version information
Akron941c1a62016-02-23 17:41:41 +010037# - Added support for archive files
38#
39# 2016/02/15
40# - Fixed temporary directory bug
41# - Improved skipping before unzipping
42# - Added EXPERIMENTAL concurrency support
43#
44# 2016/02/23
45# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010046#
47# 2016/02/27
48# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010049#
50# 2016/03/17
51# - Added meta switch
Akron941c1a62016-02-23 17:41:41 +010052# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010053
our $LAST_CHANGE = '2016/03/17';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION


# Parse command: a first argument that does not start with a dash
# is taken as the sub-command and removed from the argument list.
my $cmd;
our @ARGV;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Options that can be set multiple times are collected in arrays.
# They are declared up front and passed as \@array below:
# \(my @array) would not work, because a backslash in front of a
# parenthesized array yields references to its (zero) elements
# instead of a reference to the array itself.
my (@skip, @sigle, @anno);
my $text;

# Parse options from the command line
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
#  'human|m'     => \(my $text),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
);

# Arguments for pod2usage in case of invalid calls:
# print the usage sections and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless $input;


# Initialize log4perl object (logging to STDERR on the requested level)
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');
122
Akron941c1a62016-02-23 17:41:41 +0100123
# Get file name based on path information
#
# Takes a path and returns a flat name for it: the first occurrence
# of the input path (read from the file-level $input, optionally
# preceded by a slash) is removed, all remaining slashes are turned
# into dashes and leading dashes are stripped.
# With an input of 'corpus', the path 'corpus/A/B' becomes 'A-B'.
sub get_file_name ($) {
  my $file = shift;

  # The input path is data, not a pattern - quote its metacharacters
  $file =~ s{/?\Q$input\E}{};
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
132
Akron941c1a62016-02-23 17:41:41 +0100133
# Write file
#
# Converts a single text directory by running this script on it in a
# separate perl process, forwarding the options given on the command
# line. The document is written to "$output/<name>.json" (with ".gz"
# appended if gzip is requested), where <name> is the result of
# get_file_name() for the directory.
# Returns <name> as a string.
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The command is built as a list and executed without a shell, so
  # paths containing whitespace and values like '#all' (which a shell
  # would treat as the start of a comment) are passed on unchanged.
  # $^X is the perl binary running this script.
  my @call = ($^X, $LOCAL . '/korapxml2krill', '-i', $anno, '-o', $target);
  push @call, '-z' if $gzip;
#  push @call, '-m' if $text;
  push @call, '-m', $meta if $meta;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # NOTE(review): this forwards --no-primary when $primary is true,
  # which looks inverted for a negatable option - kept as is, confirm
  # against the meaning of the flag in to_json().
  push @call, '--no-primary' if $primary;

  # --pretty is a flag and takes no value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of ignoring them silently
  system(@call) == 0
    or $log->warn('Conversion of ' . $anno . ' failed (wait status ' . $? . ')');

  return "$file";
};
156
Nils Diewald2db9ad02013-10-29 19:26:43 +0000157
# Convert sigle to path construct:
# each value of the form CORPUS_DOC.TEXT is rewritten in place to
# CORPUS/DOC/TEXT (surrounding whitespace is dropped); values of
# any other form are left untouched.
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}
160
# Process a single file
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lookup table of the (lowercased) foundries and layers to skip
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;


  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Take the reference times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time taken since the last call
  # and the overall time since the reference time was taken
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document (the path gets a trailing slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new(
    path => $input,
    meta_type => ($meta // 'I5')
  );

  # Name used in log messages - there is no output file
  # when the document is printed to STDOUT
  my $target = $output // $input;

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization - parts missing in the --token option
  # (e.g. the optional layer) keep their default
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry if defined $foundry && length $foundry;
    $token_base_layer   = $layer   if defined $layer   && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported foundries and layers in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - only add the requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  my $print_text = ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {

    # IO::File is not loaded explicitly by this script
    require IO::File;

    my $file = $gzip ?
      IO::Compress::Gzip->new($output, Minimal => 1) :
      IO::File->new($output, "w");

    # Fail with a meaningful message if the file can't be opened
    unless ($file) {
      $log->error(
        'Unable to open ' . $output . ' for writing: ' .
          ($gzip ? $GzipError : $!)
        );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000311
# Extract XML files
#
# Extracts all texts given by --sigle from the archive file given by
# --input to the existing directory given by --output.
# Exits with 0 if all texts were extracted and with 1 otherwise.
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  # The output directory has to exist
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # The input has to be an archive file
  my $archive = -f($input) ? KorAP::XML::Archive->new($input) : undef;
  unless ($archive) {
    print "Input is not an archive.\n\n";
    exit(1);
  };

  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Test will be skipped

  # Iterate over all given sigles and extract
  my $failed = 0;
  foreach my $sigle (@sigle) {
    print "$sigle ";
    my $extracted = $archive->extract('./' . $sigle, $output);
    $failed++ unless $extracted;
    print(($extracted ? '' : 'not ') . "extracted.\n");
  };

  print "\n";

  # Signal failure only if something went wrong
  exit($failed ? 1 : 0);
}
344
# Process an archive
#
# Converts all texts below the directory or inside the archive file
# given by --input and writes one document per text to the existing
# directory given by --output, using up to --jobs concurrent forks.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # The output directory has to exist
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The first two arguments are taken as process id and exit
      # code, the last one as the reference passed to finish()
      my ($pid, $code) = (shift, shift);
      my $data = pop;
      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    # File::Temp is not loaded explicitly by this script
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        # (get_file_name() strips this prefix from the text path)
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory object
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing to process - stop here, as there is no start time to report on
  else {
    print "Input is neither a directory nor an archive.\n\n";
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000504
505__END__
Akron941c1a62016-02-23 17:41:41 +0100506
507=pod
508
509=encoding utf8
510
511=head1 NAME
512
Akronf7ad89e2016-03-16 18:22:47 +0100513korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100514
515
516=head1 SYNOPSIS
517
  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100521
522
523=head1 DESCRIPTION
524
525L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
526compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100527The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100528
529
530=head1 INSTALLATION
531
532The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
533
534 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
535
Akronc13a1702016-03-15 19:33:14 +0100536In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100537be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100538
539
540=head1 ARGUMENTS
541
542=over 2
543
544=item B<archive>
545
Akrone10ad322016-02-27 10:54:26 +0100546Process an archive as a Zip-file or a folder of KorAP-XML documents.
547
548=item B<extract>
549
550Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100551
552=back
553
554
555=head1 OPTIONS
556
557=over 2
558
559=item B<--input|-i> <directory|file>
560
Akronf7ad89e2016-03-16 18:22:47 +0100561Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100562
563=item B<--output|-o> <directory|file>
564
565Output folder for archive processing or
566document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100567writes to C<STDOUT> by default
568(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100569
570=item B<--overwrite|-w>
571
572Overwrite files that already exist.
573
574=item B<--token|-t> <foundry>[#<file>]
575
576Define the default tokenization by specifying
577the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#Tokens>.
Akron941c1a62016-02-23 17:41:41 +0100579
580=item B<--skip|-s> <foundry>[#<layer>]
581
Akronf7ad89e2016-03-16 18:22:47 +0100582Skip specific annotations by specifying the foundry
583(and optionally the layer with a C<#>-prefix),
584e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100585Can be set multiple times.
586
Akronc13a1702016-03-15 19:33:14 +0100587=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100588
Akronf7ad89e2016-03-16 18:22:47 +0100589Convert specific annotations by specifying the foundry
590(and optionally the layer with a C<#>-prefix),
591e.g. C<Mate> or C<Mate#Morpho>.
592Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100593
594=item B<--primary|-p>
595
Akronc13a1702016-03-15 19:33:14 +0100596Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100597Can be flagged using C<--no-primary> as well.
598This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100599
600=item B<--jobs|-j>
601
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100603for archive processing.
604Defaults to C<0>.
605This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100606
Akron35db6e32016-03-17 22:42:22 +0100607=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100608
Akron35db6e32016-03-17 22:42:22 +0100609Define the metadata parser to use. Defaults to C<I5>.
610Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
611This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100612
613=item B<--pretty|-y>
614
Akronc13a1702016-03-15 19:33:14 +0100615Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100616This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100617
618=item B<--gzip|-z>
619
Akronf7ad89e2016-03-16 18:22:47 +0100620Compress the output.
621Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100622
Akrone10ad322016-02-27 10:54:26 +0100623=item B<--sigle|-sg>
624
625Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100626Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100627I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100628
Akron941c1a62016-02-23 17:41:41 +0100629=item B<--log|-l>
630
631The L<Log4perl> log level, defaults to C<ERROR>.
632
633=item B<--help|-h>
634
635Print this document.
636
637=item B<--version|-v>
638
639Print version information.
640
641=back
642
Akronc13a1702016-03-15 19:33:14 +0100643=head1 ANNOTATION SUPPORT
644
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
649
Akronf7ad89e2016-03-16 18:22:47 +0100650=over 2
Akronc13a1702016-03-15 19:33:14 +0100651
652=item B<Base>
653
654=over 4
655
Akronf7ad89e2016-03-16 18:22:47 +0100656=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100657
Akronf7ad89e2016-03-16 18:22:47 +0100658=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100659
660=back
661
662=item B<Connexor>
663
664=over 4
665
Akronf7ad89e2016-03-16 18:22:47 +0100666=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100667
Akronf7ad89e2016-03-16 18:22:47 +0100668=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100669
Akronf7ad89e2016-03-16 18:22:47 +0100670=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100671
Akronf7ad89e2016-03-16 18:22:47 +0100672=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100673
674=back
675
676=item B<CoreNLP>
677
678=over 4
679
Akronf7ad89e2016-03-16 18:22:47 +0100680=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100681
Akronf7ad89e2016-03-16 18:22:47 +0100682=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100683
Akronf7ad89e2016-03-16 18:22:47 +0100684=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100685
Akronf7ad89e2016-03-16 18:22:47 +0100686=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100687
688=back
689
690=item B<DeReKo>
691
692=over 4
693
Akronf7ad89e2016-03-16 18:22:47 +0100694=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100695
696=back
697
698=item B<Glemm>
699
700=over 4
701
Akronf7ad89e2016-03-16 18:22:47 +0100702=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100703
704=back
705
706=item B<Mate>
707
708=over 4
709
Akronf7ad89e2016-03-16 18:22:47 +0100710=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100711
Akronf7ad89e2016-03-16 18:22:47 +0100712=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100713
714=back
715
716=item B<OpenNLP>
717
718=over 4
719
Akronf7ad89e2016-03-16 18:22:47 +0100720=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100721
Akronf7ad89e2016-03-16 18:22:47 +0100722=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100723
724=back
725
726=item B<Sgbr>
727
728=over 4
729
Akronf7ad89e2016-03-16 18:22:47 +0100730=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100731
Akronf7ad89e2016-03-16 18:22:47 +0100732=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100733
734=back
735
736=item B<TreeTagger>
737
738=over 4
739
Akronf7ad89e2016-03-16 18:22:47 +0100740=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100741
Akronf7ad89e2016-03-16 18:22:47 +0100742=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100743
744=back
745
746=item B<XIP>
747
748=over 4
749
Akronf7ad89e2016-03-16 18:22:47 +0100750=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100753
Akronf7ad89e2016-03-16 18:22:47 +0100754=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100755
756=back
757
758=back
759
760More importers are in preparation.
761New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
762See the built-in annotation importers as examples.
763
Akron941c1a62016-02-23 17:41:41 +0100764=head1 AVAILABILITY
765
766 https://github.com/KorAP/KorAP-XML-Krill
767
768
769=head1 COPYRIGHT AND LICENSE
770
771Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100772
Akron941c1a62016-02-23 17:41:41 +0100773Author: L<Nils Diewald|http://nils-diewald.de/>
774
775L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
776Corpus Analysis Platform at the
777L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
778member of the
779L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
780
781This program is free software published under the
782L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
783
784=cut