blob: 35ec44ca71ea5dbd4fcd0e86b952f3d775403a61 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
12use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010013use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010014use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Tokenizer;
Akron941c1a62016-02-23 17:41:41 +010016use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010017# TODO: use Parallel::Loops
Akron93d620e2016-02-05 19:40:05 +010018
Akron941c1a62016-02-23 17:41:41 +010019# CHANGES:
20# ----------------------------------------------------------
21# 2013/11/25
22# - Initial release
23#
24# 2014/10/29
25# - Merges foundry data to create indexer friendly documents
26#
Akron93d620e2016-02-05 19:40:05 +010027# 2016/02/04
28# - renamed to korapxml2krill
29# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010030#
31# 2016/02/12
32# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010033# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010034#
35# 2016/02/14
36# - Added version information
Akron941c1a62016-02-23 17:41:41 +010037# - Added support for archive files
38#
39# 2016/02/15
40# - Fixed temporary directory bug
41# - Improved skipping before unzipping
42# - Added EXPERIMENTAL concurrency support
43#
44# 2016/02/23
45# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010046#
47# 2016/02/27
48# - Added extract function
Akron941c1a62016-02-23 17:41:41 +010049# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010050
Akronee130192016-03-03 18:24:53 +010051our $LAST_CHANGE = '2016/03/02';
Akron941c1a62016-02-23 17:41:41 +010052our $LOCAL = $FindBin::Bin;
53our $VERSION_MSG = <<"VERSION";
54Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
55VERSION
56
57
# Parse command
59my $cmd;
60our @ARGV;
61if ($ARGV[0] && index($ARGV[0], '-') != 0) {
62 $cmd = shift @ARGV;
Akron150b29e2016-02-14 23:06:48 +010063};
Akron93d620e2016-02-05 19:40:05 +010064
# Options that can be given multiple times are collected in arrays.
# They are declared up front, because taking a reference to a
# parenthesized declaration, i.e. \(my @anno), does not yield an array
# reference but a (here empty) list of references to the elements,
# which would leave the option without a destination.
my (@skip, @sigle, @anno);

# Parse options from the command line
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'human|m'     => \(my $text),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),

  # Print the full usage information
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
    );
  },

  # Print the version message only
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG
    )
  }
);
96
Akron941c1a62016-02-23 17:41:41 +010097my %ERROR_HASH = (
98 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
99 -verbose => 99,
100 -msg => $VERSION_MSG,
101 -exit => 1
102);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000103
Akron941c1a62016-02-23 17:41:41 +0100104# Input has to be defined
105pod2usage(%ERROR_HASH) unless $input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000106
Nils Diewald7364d1f2013-11-05 19:26:35 +0000107
Akron941c1a62016-02-23 17:41:41 +0100108# Initialize log4perl object
Nils Diewald7364d1f2013-11-05 19:26:35 +0000109Log::Log4perl->init({
110 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
111 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
112 'log4perl.appender.STDERR.layout' => 'PatternLayout',
113 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
114});
115
116my $log = Log::Log4perl->get_logger('main');
117
Akron941c1a62016-02-23 17:41:41 +0100118
# Get file name based on path information
#
# Takes a path to a text directory, removes the first occurrence of
# the input path (given by --input, optionally preceded by a slash),
# turns all remaining slashes into dashes and strips leading dashes.
# Returns the resulting flat file name (without extension).
sub get_file_name ($) {
  my $file = shift;

  # Quote the input path, so characters like '.', '+' or '(' in the
  # path are matched literally instead of being treated as a pattern
  $file =~ s{/?\Q$input\E}{};
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
127
Akron941c1a62016-02-23 17:41:41 +0100128
# Write file
#
# Converts a single text directory by running this script again in
# single file mode as a child process, forwarding the relevant
# command line options.
#
# Expects the path to the text directory as the only parameter and
# returns the base name of the target file (without directory and
# extension) as a string.
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json' . ($gzip ? '.gz' : '');

  # The call is passed as a list, so no shell is involved: values like
  # '#all' (a comment to the shell) or paths containing whitespace
  # reach the child process unaltered
  my @call = ($^X, $LOCAL . '/korapxml2krill', '-i', $anno, '-o', $target);
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # Only forward an explicitly negated primary flag
  push @call, '--no-primary' if defined $primary && !$primary;

  # Pretty is a flag without a value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @anno;
  push @call, '-s', $_ foreach @skip;

  if (system(@call) != 0) {
    $log->error("Conversion of $anno failed (child status $?)");
  };

  return "$file";
};
150
Nils Diewald2db9ad02013-10-29 19:26:43 +0000151
Akrone10ad322016-02-27 10:54:26 +0100152# Convert sigle to path construct
153s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
154
Akron941c1a62016-02-23 17:41:41 +0100155# Process a single file
156unless ($cmd) {
Nils Diewald59094f22014-11-05 18:20:50 +0000157
Akron941c1a62016-02-23 17:41:41 +0100158 # Can't print gzip to STDOUT
159 pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000160
Akron941c1a62016-02-23 17:41:41 +0100161 my %skip;
162 $skip{lc($_)} = 1 foreach @skip;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000163
164
Akron941c1a62016-02-23 17:41:41 +0100165 # Ignore processing
166 if (!$overwrite && $output && -e $output) {
167 $log->trace($output . ' already exists');
168 exit(0);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000169 };
Akron941c1a62016-02-23 17:41:41 +0100170
171 BEGIN {
172 $main::TIME = Benchmark->new;
173 $main::LAST_STOP = Benchmark->new;
174 };
175
# Log (on trace level) the time passed since the last call and since
# the start of the script, then remember the current time as the new
# reference point for the next call
sub stop_time {
  my $now     = Benchmark->new;
  my $lap     = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->trace('The code took: ' . $lap . ' (overall: ' . $overall . ')');

  $main::LAST_STOP = $now;
};
185
186 # Create and parse new document
187 $input =~ s{([^/])$}{$1/};
188 my $doc = KorAP::XML::Krill->new( path => $input );
189
190 unless ($doc->parse) {
191 $log->warn($output . " can't be processed - no document data");
192 exit(0);
193 };
194
195 my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
196 if ($token_base) {
197 ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
198 };
199
200 # Get tokenization
201 my $tokens = KorAP::XML::Tokenizer->new(
202 path => $doc->path,
203 doc => $doc,
204 foundry => $token_base_foundry,
205 layer => $token_base_layer,
206 name => 'tokens'
207 );
208
209 # Unable to process base tokenization
210 unless ($tokens->parse) {
211 $log->error($output . " can't be processed - no base tokenization");
212 exit(0);
213 };
214
215 my @layers;
216 push(@layers, ['Base', 'Sentences']);
217 push(@layers, ['Base', 'Paragraphs']);
218
219 # Connexor
220 push(@layers, ['Connexor', 'Morpho']);
221 push(@layers, ['Connexor', 'Syntax']);
222 push(@layers, ['Connexor', 'Phrase']);
223 push(@layers, ['Connexor', 'Sentences']);
224
225 # CoreNLP
226 push(@layers, ['CoreNLP', 'NamedEntities']);
227 push(@layers, ['CoreNLP', 'Sentences']);
228 push(@layers, ['CoreNLP', 'Morpho']);
229 push(@layers, ['CoreNLP', 'Constituency']);
230
231 # DeReKo
232 push(@layers, ['DeReKo', 'Structure']);
233
234 # Glemm
235 push(@layers, ['Glemm', 'Morpho']);
236
237 # Malt
238 # push(@layers, ['Malt', 'Dependency']);
239
240 # Mate
241 push(@layers, ['Mate', 'Morpho']);
242 push(@layers, ['Mate', 'Dependency']);
243
244 # OpenNLP
245 push(@layers, ['OpenNLP', 'Morpho']);
246 push(@layers, ['OpenNLP', 'Sentences']);
247
248 # Schreibgebrauch
249 push(@layers, ['Sgbr', 'Lemma']);
250 push(@layers, ['Sgbr', 'Morpho']);
251
252 # TreeTagger
253 push(@layers, ['TreeTagger', 'Morpho']);
254 push(@layers, ['TreeTagger', 'Sentences']);
255
256 # XIP
257 push(@layers, ['XIP', 'Morpho']);
258 push(@layers, ['XIP', 'Constituency']);
259 push(@layers, ['XIP', 'Sentences']);
260 push(@layers, ['XIP', 'Dependency']);
261
262
263 if ($skip{'#all'}) {
Akronc13a1702016-03-15 19:33:14 +0100264 foreach (@anno) {
Akron941c1a62016-02-23 17:41:41 +0100265 $tokens->add(split('#', $_));
Nils Diewald7364d1f2013-11-05 19:26:35 +0000266 stop_time;
Nils Diewald2db9ad02013-10-29 19:26:43 +0000267 };
Nils Diewald7364d1f2013-11-05 19:26:35 +0000268 }
269 else {
Akron941c1a62016-02-23 17:41:41 +0100270 # Add to index file - respect skipping
271 foreach my $info (@layers) {
272 # Skip if Foundry or Foundry#Layer should be skipped
273 unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
274 $tokens->add(@$info);
275 stop_time;
276 };
277 };
Nils Diewald2db9ad02013-10-29 19:26:43 +0000278 };
279
Akron941c1a62016-02-23 17:41:41 +0100280 my $file;
281
282 my $print_text = $text ? $tokens->to_string($primary) :
283 ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
284
285 if ($output) {
286
287 if ($gzip) {
288 $file = IO::Compress::Gzip->new($output, Minimal => 1);
289 }
290 else {
291 $file = IO::File->new($output, "w");
292 };
293
294 $file->print($print_text);
295 $file->close;
296 }
297
298 else {
299 print $print_text . "\n";
300 };
301
302 stop_time;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000303}
Nils Diewald59094f22014-11-05 18:20:50 +0000304
Akrone10ad322016-02-27 10:54:26 +0100305# Extract XML files
306elsif ($cmd eq 'extract') {
307
308 pod2usage(%ERROR_HASH) unless $output;
309
310 # TODO: Support sigles and full archives
311
312 if ($output && (!-e $output || !-d $output)) {
313 print "Directory '$output' does not exist.\n\n";
314 exit(0);
315 };
316
317 if (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
318
319 unless ($archive->test_unzip) {
320 print "Unzip is not installed or incompatible.\n\n";
321 exit(1);
322 };
323
324 # Test will be skipped
325
326 # Iterate over all given sigles and extract
327 foreach (@sigle) {
328 print "$_ ";
329 print '' . ($archive->extract('./'. $_, $output) ? '' : 'not ');
330 print "extracted.\n";
331 };
332
333 print "\n";
334 exit(1);
335 };
336}
337
Akron941c1a62016-02-23 17:41:41 +0100338# Process an archive
339elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000340
Akrone10ad322016-02-27 10:54:26 +0100341 # TODO: Support sigles
342
Akron941c1a62016-02-23 17:41:41 +0100343 pod2usage(%ERROR_HASH) unless $output;
344
345 if ($output && (!-e $output || !-d $output)) {
346 print "Directory '$output' does not exist.\n\n";
347 exit(0);
348 };
349
350 # Zero means: everything runs in the parent process
351 my $pool = Parallel::ForkManager->new($jobs);
352
353 my $count = 0; # Texts to process
354 my $iter = 1; # Current text in process
355
356 # Report on fork message
$pool->run_on_finish (
  sub {
    # The callback receives the process id and the exit code first;
    # the data reference passed to finish() is the last parameter
    my ($pid, $code) = @_;
    my $data = $_[-1];

    # The data may be missing, e.g. if the child terminated abnormally
    my $msg = (ref $data eq 'SCALAR' && defined $$data) ? $$data : '';

    print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
      ($iter++) . "/$count]" .
      ($code ? " $code" : '') .
      " $msg\n";
  }
);
367
368 my $t;
369 print "Reading data ...\n";
370
371 # Input is a directory
372 if (-d $input) {
373 my $it = Directory::Iterator->new($input);
374 my @dirs;
375 my $dir;
376
377 while (1) {
378 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
379 push @dirs, $dir;
380 $it->prune;
381 };
382 last unless $it->next;
383 };
384
385 print "Start processing ...\n";
386 $t = Benchmark->new;
387 $count = scalar @dirs;
388
389 DIRECTORY_LOOP:
390 for (my $i = 0; $i < $count; $i++) {
391
392 unless ($overwrite) {
393 my $filename = catfile(
394 $output,
395 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
396 );
397
398 if (-e $filename) {
399 $iter++;
400 print "Skip $filename\n";
401 next;
402 };
403 };
404
405 # Get the next fork
406 my $pid = $pool->start and next DIRECTORY_LOOP;
407 my $msg;
408
409 $msg = write_file($dirs[$i]);
410 $pool->finish(0, \$msg);
411 };
412 }
413
414 # Input is a file
415 elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
416 unless ($archive->test_unzip) {
417 print "Unzip is not installed or incompatible.\n\n";
418 exit(1);
419 };
420
421 unless ($archive->test) {
422 print "Zip archive not compatible.\n\n";
423 exit(1);
424 };
425
426 print "Start processing ...\n";
427 $t = Benchmark->new;
428 my @dirs = $archive->list_texts;
429 $count = scalar @dirs;
430
431 ARCHIVE_LOOP:
432 for (my $i = 0; $i < $count; $i++) {
433
434 # Split path information
435 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
436
437 unless ($overwrite) {
438 my $filename = catfile(
439 $output,
440 get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '')
441 );
442
443 if (-e $filename) {
444 $iter++;
445 print "Skip $filename\n";
446 next;
447 };
448 };
449
450 # Get the next fork
451 my $pid = $pool->start and next ARCHIVE_LOOP;
452
453 # Create temporary file
454 my $temp = File::Temp->newdir;
455
456 my $msg;
457
458 # Extract from archive
459 if ($archive->extract($dirs[$i], $temp)) {
460
461 # Create corpus directory
462 $input = catdir("$temp", $corpus);
463
464 # Temporary directory
465 my $dir = catdir($input, $doc, $text);
466
467 # Write file
468 $msg = write_file($dir);
469
470 $temp = undef;
471 $pool->finish(0, \$msg);
472 }
473 else {
474
475 $temp = undef;
476 $msg = "Unable to extract " . $dirs[$i] . "\n";
477 $pool->finish(1, \$msg);
478 };
479 };
480 }
481
482 else {
483 print "Input is neither a directory nor an archive.\n\n";
484 };
485
# Wait until all forked conversions are finished
$pool->wait_all_children;

print "Done.\n";

# The start time is only set once processing has begun - there is
# nothing to report if the input was neither a directory nor an archive
print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
490}
491
492# Unknown command
493else {
494 warn "Unknown command '$cmd'.\n\n";
495 pod2usage(%ERROR_HASH);
496}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000497
498__END__
Akron941c1a62016-02-23 17:41:41 +0100499
500=pod
501
502=encoding utf8
503
504=head1 NAME
505
506korapxml2krill - Merge KorapXML data and create Krill friendly documents
507
508
509=head1 SYNOPSIS
510
Akronc13a1702016-03-15 19:33:14 +0100511 $ korapxml2krill -z --input <directory> --output <filename>
512 $ korapxml2krill archive -z --input <directory> --output <directory>
 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100514
515
516=head1 DESCRIPTION
517
518L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
519compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100521
522
523=head1 INSTALLATION
524
525The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
526
527 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
528
Akronc13a1702016-03-15 19:33:14 +0100529In case everything went well, the C<korapxml2krill> tool will
530be available on your command line.
Akron941c1a62016-02-23 17:41:41 +0100531
532
533=head1 ARGUMENTS
534
535=over 2
536
537=item B<archive>
538
Akrone10ad322016-02-27 10:54:26 +0100539Process an archive as a Zip-file or a folder of KorAP-XML documents.
540
541=item B<extract>
542
543Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100544
545=back
546
547
548=head1 OPTIONS
549
550=over 2
551
552=item B<--input|-i> <directory|file>
553
554Directory or archive file of documents to index.
555
556=item B<--output|-o> <directory|file>
557
558Output folder for archive processing or
559document name for single output (optional),
Akronc13a1702016-03-15 19:33:14 +0100560writes to C<STDOUT> by default.
Akron941c1a62016-02-23 17:41:41 +0100561
562=item B<--overwrite|-w>
563
564Overwrite files that already exist.
565
566=item B<--token|-t> <foundry>[#<file>]
567
568Define the default tokenization by specifying
569the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100570of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100571
572=item B<--skip|-s> <foundry>[#<layer>]
573
Skip specific foundries by specifying the name
or specific layers by appending the layer name
to the foundry name, separated by a C<#>,
e.g. C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100578Can be set multiple times.
579
Akronc13a1702016-03-15 19:33:14 +0100580=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100581
Allow specific annotation foundries and layers by defining them
583combining the foundry name with a C<#> and the layer name.
Akron941c1a62016-02-23 17:41:41 +0100584
585=item B<--primary|-p>
586
Akronc13a1702016-03-15 19:33:14 +0100587Output primary data or not. Defaults to C<true>.
Akron941c1a62016-02-23 17:41:41 +0100588Can be flagged using --no-primary as well.
Akronc13a1702016-03-15 19:33:14 +0100589This is deprecated.
Akron941c1a62016-02-23 17:41:41 +0100590
591=item B<--jobs|-j>
592
Define the number of concurrent jobs in separate forks
Akronc13a1702016-03-15 19:33:14 +0100594for archive processing, defaults to C<0>.
595This is experimental!
Akron941c1a62016-02-23 17:41:41 +0100596
597=item B<--human|-m>
598
Represent the data in an alternative human readable format.
600This is deprecated.
Akron941c1a62016-02-23 17:41:41 +0100601
602=item B<--pretty|-y>
603
Akronc13a1702016-03-15 19:33:14 +0100604Pretty print JSON output. Defaults to C<false>.
Akron941c1a62016-02-23 17:41:41 +0100605
606=item B<--gzip|-z>
607
608Compress the output (expects a defined output file in single processing).
609
Akrone10ad322016-02-27 10:54:26 +0100610=item B<--sigle|-sg>
611
612Extract the given text sigles.
613Currently only supported on C<extract>.
614Can be set multiple times.
615
Akron941c1a62016-02-23 17:41:41 +0100616=item B<--log|-l>
617
618The L<Log4perl> log level, defaults to C<ERROR>.
619
620=item B<--help|-h>
621
622Print this document.
623
624=item B<--version|-v>
625
626Print version information.
627
628=back
629
Akronc13a1702016-03-15 19:33:14 +0100630=head1 ANNOTATION SUPPORT
631
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
635L<Krill|https://github.com/KorAP/Krill>.
636
=over 2
638
639=item B<Base>
640
641=over 4
642
643=item Paragraphs
644
645=item Sentences
646
647=back
648
649=item B<Connexor>
650
651=over 4
652
653=item Morpho
654
655=item Phrase
656
657=item Sentences
658
659=item Syntax
660
661=back
662
663=item B<CoreNLP>
664
665=over 4
666
667=item Constituency
668
669=item Morpho
670
671=item NamedEntities
672
673=item Sentences
674
675=back
676
677=item B<DeReKo>
678
679=over 4
680
681=item Structure
682
683=back
684
685=item B<Glemm>
686
687=over 4
688
689=item Morpho
690
691=back
692
693=item B<Mate>
694
695=over 4
696
697=item Dependency
698
699=item Morpho
700
701=back
702
703=item B<OpenNLP>
704
705=over 4
706
707=item Morpho
708
709=item Sentences
710
711=back
712
713=item B<Sgbr>
714
715=over 4
716
717=item Lemma
718
719=item Morpho
720
721=back
722
723=item B<TreeTagger>
724
725=over 4
726
727=item Morpho
728
729=item Sentences
730
731=back
732
733=item B<XIP>
734
735=over 4
736
737=item Constituency
738
739=item Morpho
740
741=item Sentences
742
743=back
744
745=back
746
747More importers are in preparation.
748New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
749See the built-in annotation importers as examples.
750
Akron941c1a62016-02-23 17:41:41 +0100751=head1 AVAILABILITY
752
753 https://github.com/KorAP/KorAP-XML-Krill
754
755
756=head1 COPYRIGHT AND LICENSE
757
758Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
759Author: L<Nils Diewald|http://nils-diewald.de/>
760
761L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
762Corpus Analysis Platform at the
763L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
764member of the
765L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
766
767This program is free software published under the
768L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
769
770=cut