blob: 0d7fb404df593ad8ddce0430fbbc7d9b1d2dfb49 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
12use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010013use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010014use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010016use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010017# TODO: use Parallel::Loops
Akron93d620e2016-02-05 19:40:05 +010018
Akron941c1a62016-02-23 17:41:41 +010019# CHANGES:
20# ----------------------------------------------------------
21# 2013/11/25
22# - Initial release
23#
24# 2014/10/29
25# - Merges foundry data to create indexer friendly documents
26#
Akron93d620e2016-02-05 19:40:05 +010027# 2016/02/04
28# - renamed to korapxml2krill
29# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010030#
31# 2016/02/12
32# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010033# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010034#
35# 2016/02/14
36# - Added version information
Akron941c1a62016-02-23 17:41:41 +010037# - Added support for archive files
38#
39# 2016/02/15
40# - Fixed temporary directory bug
41# - Improved skipping before unzipping
42# - Added EXPERIMENTAL concurrency support
43#
44# 2016/02/23
45# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010046#
47# 2016/02/27
48# - Added extract function
Akron941c1a62016-02-23 17:41:41 +010049# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010050
# Date of the latest change, shown as part of the version banner
our $LAST_CHANGE = '2016/03/02';

# Directory of this script, used to call the script again per document
our $LOCAL = $FindBin::Bin;

# Banner handed to pod2usage for help, version and usage errors
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION


# Parse command:
# A first argument that does not start with a dash is taken as the
# command and removed from the argument list before option parsing.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +010064
# Options that may be given multiple times are collected in arrays.
# They are declared up front and passed as \@array, because
# \(my @array) does not yield an array reference but a list of
# references to the (zero) elements of the array - the option
# would silently lose its destination.
my (@skip, @sigle, @anno);

# Parse options from the command line
GetOptions(
  'input|i=s'   => \(my $input),              # Directory or archive to read
  'output|o=s'  => \(my $output),             # File or directory to write
  'overwrite|w' => \(my $overwrite),          # Replace existing files
  'human|m'     => \(my $text),               # Alternative text output
  'token|t=s'   => \(my $token_base),         # <foundry>[#<layer>]
  'gzip|z'      => \(my $gzip),               # Compress the output
  'skip|s=s'    => \@skip,                    # Annotations to skip
  'sigle|sg=s'  => \@sigle,                   # Text sigles to extract
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,                    # Annotations to convert
  'primary|p!'  => \(my $primary),            # Negatable: --no-primary
  'pretty|y'    => \(my $pretty),             # Pretty print JSON
  'jobs|j=i'    => \(my $jobs = 0),           # Number of forks
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG
    )
  }
);
96
# Arguments for pod2usage whenever the call is not valid:
# print the usage sections together with the version banner
# and leave with exit status 1
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -exit     => 1
);

# Input has to be defined
unless ($input) {
  pod2usage(%ERROR_HASH);
};


# Initialize log4perl object:
# everything is written to STDERR using the requested log level
my %log_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
117
Akron941c1a62016-02-23 17:41:41 +0100118
119# Get file name based on path information
# Removes the first occurrence of the input path (optionally preceded
# by a slash) from the given path, replaces all remaining slashes
# with dashes and drops leading dashes.
#
# Expects a path as a string and returns the derived name.
# Reads the file-level $input.
# The prototype is kept, as the function is called without parentheses.
sub get_file_name ($) {
  my $file = shift;

  # The input path is quoted, so regex metacharacters in it
  # (like '.', '+' or brackets) are matched literally
  $file =~ s{/?\Q$input\E}{};
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
127
Akron941c1a62016-02-23 17:41:41 +0100128
129# Write file
# Converts a single text directory by calling this script again
# in single document mode, forwarding the relevant command line options.
#
# Expects the path of the text directory.
# Returns the file name (without extension) derived for the text.
# Reads the file-level option variables and $LOCAL.
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is built as a list, so no shell is involved:
  # paths with whitespace are passed verbatim and values starting
  # with '#' (like '#all') are not treated as shell comments
  my @call = (
    'perl', $LOCAL . '/korapxml2krill',
    '-i', $anno,
    '-o', $target
  );
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # Forward the negatable primary flag exactly as it was given
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # Pretty is a plain flag and takes no value
  push @call, '-y' if $pretty;
  push @call, map { ('-a', $_) } @anno;
  push @call, map { ('-s', $_) } @skip;

  # Report failing conversions instead of ignoring them silently
  if (system(@call) != 0) {
    $log->error("Conversion of $anno failed (status $?)");
  };

  return "$file";
};
150
Nils Diewald2db9ad02013-10-29 19:26:43 +0000151
Akrone10ad322016-02-27 10:54:26 +0100152# Convert sigle to path construct
# Each sigle is rewritten in place: surrounding whitespace is dropped,
# the first underscore and the following dot become slashes
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
154
Akron941c1a62016-02-23 17:41:41 +0100155# Process a single file
# Without a command a single document is converted:
# the document is parsed, tokenized based on the chosen tokenization,
# enriched with the requested annotation layers and finally
# written to the output file or to STDOUT.
unless ($cmd) {

  # IO::File is needed for uncompressed output - load it explicitly
  # instead of relying on another module to pull it in
  require IO::File;

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lookup table of the annotations to skip (lower cased)
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;


  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Trace the time passed since the last call and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
          ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
        );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document
  # (the path is required to end with a slash)
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new( path => $input );

  # Name used in log messages - output is optional in this mode,
  # so fall back to the input path
  my $subject = defined $output ? $output : $input;

  unless ($doc->parse) {
    $log->warn($subject . " can't be processed - no document data");
    exit(0);
  };

  # The layer is optional in the token option, so every part that
  # is missing keeps its default
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry if defined $foundry && length $foundry;
    $token_base_layer   = $layer   if defined $layer && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($subject . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotation layers as [foundry, layer] pairs,
  # in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );


  # Everything is skipped - only add the explicitly requested annotations
  if ($skip{'#all'}) {
    foreach (@anno) {
      $tokens->add(split('#', $_));
      stop_time;
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      # Skip if Foundry or Foundry#Layer should be skipped
      unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
        $tokens->add(@$info);
        stop_time;
      };
    };
  };

  my $file;

  # Serialize as text, pretty JSON or compact JSON
  my $print_text = $text ? $tokens->to_string($primary) :
    ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {

    # Stop with a meaningful message, if the output can't be opened
    if ($gzip) {
      $file = IO::Compress::Gzip->new($output, Minimal => 1)
        or die "Unable to write to $output: $GzipError\n";
    }
    else {
      $file = IO::File->new($output, "w")
        or die "Unable to write to $output: $!\n";
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  stop_time;
}
Nils Diewald59094f22014-11-05 18:20:50 +0000304
Akrone10ad322016-02-27 10:54:26 +0100305# Extract XML files
# The extract command unpacks the texts given by sigle from an
# archive file into the output directory.
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Test will be skipped

    # Without sigles there is nothing to extract - report this
    # instead of finishing without any output
    unless (@sigle) {
      print "No sigle given.\n\n";
      exit(1);
    };

    # Count failures, so the exit status reflects the result
    my $failed = 0;

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {
      my $extracted = $archive->extract('./'. $sigle, $output);
      $failed++ unless $extracted;

      print "$sigle ";
      print '' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # Zero only if everything was extracted
    exit($failed ? 1 : 0);
  }

  # Report unusable input instead of finishing silently
  else {
    print "Input is not an archive.\n\n";
    exit(1);
  };
}
337
Akron941c1a62016-02-23 17:41:41 +0100338# Process an archive
# The archive command converts all texts of a directory or of an
# archive file. Every text is converted by write_file(), optionally
# in parallel forks, and the progress is reported on STDOUT.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # File::Temp is needed for archive extraction - load it explicitly
  # instead of relying on another module to pull it in
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1;  # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The process id and the exit code are the first arguments,
      # the data passed to finish() is the last one
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # The data may be missing, e.g. if a fork died
      my $msg = ref $data eq 'SCALAR' ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
          ($code ? " $code" : '') .
            " $msg\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc_part, $text_part) = $archive->split_path($text_path);

      # Skip texts that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc_part, $text_part)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory - $input is redirected to it,
        # as get_file_name() strips $input from the path
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc_part, $text_part);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing was started and no start time was taken,
  # so there is nothing to wait for or to report
  else {
    print "Input is neither a directory nor an archive.\n\n";
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
491
# Any other command is not supported:
# warn about it and leave with the usage message
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000497
498__END__
Akron941c1a62016-02-23 17:41:41 +0100499
500=pod
501
502=encoding utf8
503
504=head1 NAME
505
Akronf7ad89e2016-03-16 18:22:47 +0100506korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100507
508
509=head1 SYNOPSIS
510
Akronc13a1702016-03-15 19:33:14 +0100511 $ korapxml2krill -z --input <directory> --output <filename>
512 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100514
515
516=head1 DESCRIPTION
517
518L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
519compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100520The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100521
522
523=head1 INSTALLATION
524
525The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
526
527 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
528
Akronc13a1702016-03-15 19:33:14 +0100529In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100530be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100531
532
533=head1 ARGUMENTS
534
535=over 2
536
537=item B<archive>
538
Akrone10ad322016-02-27 10:54:26 +0100539Process an archive as a Zip-file or a folder of KorAP-XML documents.
540
541=item B<extract>
542
543Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100544
545=back
546
547
548=head1 OPTIONS
549
550=over 2
551
552=item B<--input|-i> <directory|file>
553
Akronf7ad89e2016-03-16 18:22:47 +0100554Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100555
556=item B<--output|-o> <directory|file>
557
558Output folder for archive processing or
559document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100560writes to C<STDOUT> by default
561(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100562
563=item B<--overwrite|-w>
564
565Overwrite files that already exist.
566
567=item B<--token|-t> <foundry>[#<file>]
568
569Define the default tokenization by specifying
570the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100571of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100572
573=item B<--skip|-s> <foundry>[#<layer>]
574
Akronf7ad89e2016-03-16 18:22:47 +0100575Skip specific annotations by specifying the foundry
576(and optionally the layer with a C<#>-prefix),
577e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100578Can be set multiple times.
579
Akronc13a1702016-03-15 19:33:14 +0100580=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100581
Akronf7ad89e2016-03-16 18:22:47 +0100582Convert specific annotations by specifying the foundry
583(and optionally the layer with a C<#>-prefix),
584e.g. C<Mate> or C<Mate#Morpho>.
585Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100586
587=item B<--primary|-p>
588
Akronc13a1702016-03-15 19:33:14 +0100589Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100590Can be flagged using C<--no-primary> as well.
591This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100592
593=item B<--jobs|-j>
594
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100596for archive processing.
597Defaults to C<0>.
598This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100599
600=item B<--human|-m>
601
Represent the data in an alternative human readable format.
Akronf7ad89e2016-03-16 18:22:47 +0100603This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100604
605=item B<--pretty|-y>
606
Akronc13a1702016-03-15 19:33:14 +0100607Pretty print JSON output. Defaults to C<false>.
Akron941c1a62016-02-23 17:41:41 +0100608
609=item B<--gzip|-z>
610
Akronf7ad89e2016-03-16 18:22:47 +0100611Compress the output.
612Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100613
Akrone10ad322016-02-27 10:54:26 +0100614=item B<--sigle|-sg>
615
616Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100617Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100618I<Currently only supported on C<extract>.>
Akrone10ad322016-02-27 10:54:26 +0100619
Akron941c1a62016-02-23 17:41:41 +0100620=item B<--log|-l>
621
622The L<Log4perl> log level, defaults to C<ERROR>.
623
624=item B<--help|-h>
625
626Print this document.
627
628=item B<--version|-v>
629
630Print version information.
631
632=back
633
Akronc13a1702016-03-15 19:33:14 +0100634=head1 ANNOTATION SUPPORT
635
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
640
Akronf7ad89e2016-03-16 18:22:47 +0100641=over 2
Akronc13a1702016-03-15 19:33:14 +0100642
643=item B<Base>
644
645=over 4
646
Akronf7ad89e2016-03-16 18:22:47 +0100647=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100648
Akronf7ad89e2016-03-16 18:22:47 +0100649=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100650
651=back
652
653=item B<Connexor>
654
655=over 4
656
Akronf7ad89e2016-03-16 18:22:47 +0100657=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100658
Akronf7ad89e2016-03-16 18:22:47 +0100659=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100660
Akronf7ad89e2016-03-16 18:22:47 +0100661=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100662
Akronf7ad89e2016-03-16 18:22:47 +0100663=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100664
665=back
666
667=item B<CoreNLP>
668
669=over 4
670
Akronf7ad89e2016-03-16 18:22:47 +0100671=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100672
Akronf7ad89e2016-03-16 18:22:47 +0100673=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100674
Akronf7ad89e2016-03-16 18:22:47 +0100675=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100676
Akronf7ad89e2016-03-16 18:22:47 +0100677=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100678
679=back
680
681=item B<DeReKo>
682
683=over 4
684
Akronf7ad89e2016-03-16 18:22:47 +0100685=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100686
687=back
688
689=item B<Glemm>
690
691=over 4
692
Akronf7ad89e2016-03-16 18:22:47 +0100693=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100694
695=back
696
697=item B<Mate>
698
699=over 4
700
Akronf7ad89e2016-03-16 18:22:47 +0100701=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100702
Akronf7ad89e2016-03-16 18:22:47 +0100703=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100704
705=back
706
707=item B<OpenNLP>
708
709=over 4
710
Akronf7ad89e2016-03-16 18:22:47 +0100711=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100712
Akronf7ad89e2016-03-16 18:22:47 +0100713=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100714
715=back
716
717=item B<Sgbr>
718
719=over 4
720
Akronf7ad89e2016-03-16 18:22:47 +0100721=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100722
Akronf7ad89e2016-03-16 18:22:47 +0100723=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100724
725=back
726
727=item B<TreeTagger>
728
729=over 4
730
Akronf7ad89e2016-03-16 18:22:47 +0100731=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100732
Akronf7ad89e2016-03-16 18:22:47 +0100733=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100734
735=back
736
737=item B<XIP>
738
739=over 4
740
Akronf7ad89e2016-03-16 18:22:47 +0100741=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100742
Akronf7ad89e2016-03-16 18:22:47 +0100743=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100744
Akronf7ad89e2016-03-16 18:22:47 +0100745=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100746
747=back
748
749=back
750
751More importers are in preparation.
752New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
753See the built-in annotation importers as examples.
754
Akron941c1a62016-02-23 17:41:41 +0100755=head1 AVAILABILITY
756
757 https://github.com/KorAP/KorAP-XML-Krill
758
759
760=head1 COPYRIGHT AND LICENSE
761
762Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100763
Akron941c1a62016-02-23 17:41:41 +0100764Author: L<Nils Diewald|http://nils-diewald.de/>
765
766L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
767Corpus Analysis Platform at the
768L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
769member of the
770L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
771
772This program is free software published under the
773L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
774
775=cut