blob: 6bdb961eb87eb9f8e3132dbaff66e8748270a4e5 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron3741f8b2016-12-21 19:55:21 +010076# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020077# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron3741f8b2016-12-21 19:55:21 +010079# 2016/12/21
80# - added support for base-sentences and base-tokenizations
81#
Akron4fa37c32017-01-20 14:43:10 +010082# 2017/01/20
83# - added support for DRuKoLa annotations
84#
Akron41ac10b2017-02-08 22:47:25 +010085# 2017/02/08
86# - added support for pagebreak annotations
87#
Akron941c1a62016-02-23 17:41:41 +010088# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010089
Akron41ac10b2017-02-08 22:47:25 +010090our $LAST_CHANGE = '2017/02/08';
Akron941c1a62016-02-23 17:41:41 +010091our $LOCAL = $FindBin::Bin;
92our $VERSION_MSG = <<"VERSION";
93Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
94VERSION
95
# Parse command:
# The first argument names the command, unless it is missing
# or starts with a dash (in which case it is an option)
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +0100102
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Single value options - the values assigned here
# are the defaults in case the option is not passed
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);
my $token_base      = 'OpenNLP#tokens';
my $base_sentences  = '';
my $base_paragraphs = '';
my $base_pagebreaks = '';
my $cache_file      = 'korapxml2krill.cache';
my $log_level       = 'ERROR';
my $jobs            = 0;
my $cache_size      = '50m';
my $cache_delete    = 1;
my $cache_init      = 1;

# Handler for --help: version message and the listed POD sections
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: version message with the lowest verbosity
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'             => \@input,
  'output|o=s'            => \$output,
  'overwrite|w'           => \$overwrite,
  'meta|m=s'              => \$meta,
  'token|t=s'             => \$token_base,
  'base-sentences|bs=s'   => \$base_sentences,
  'base-paragraphs|bp=s'  => \$base_paragraphs,
  'base-pagebreaks|bpb=s' => \$base_pagebreaks,
  'gzip|z'                => \$gzip,
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \$cache_file,
  'log|l=s'               => \$log_level,
  'anno|a=s'              => \@anno,
  'primary|p!'            => \$primary,
  'pretty|y'              => \$pretty,
  'jobs|j=i'              => \$jobs,
  'cache-size|cs=s'       => \$cache_size,
  'cache-delete|cd!'      => \$cache_delete,
  'cache-init|ci!'        => \$cache_init,
  'help|h'                => $print_help,
  'version|v'             => $print_version
);
144
# The base layer names are compared in lower case below
$_ = lc for ($base_sentences, $base_paragraphs, $base_pagebreaks);

# Arguments for pod2usage in case of an invalid call
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# At least one input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip and not $output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000162
# Initialize log4perl object:
# Messages of the requested level (upper cased) and above are
# written by the STDERR appender using the pattern layout below
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
172
# Lookup table of everything that should be skipped (lower cased)
my %skip = map { lc($_) => 1 } @skip;

# List of all supported annotation layers as [Foundry, Layer] pairs.
# The order of the list is the order in which the layers are added.
my @layers;

# Base - only used if no alternative base layer is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# DeReKo - collect all base layers that are taken from the structure
my @dereko_attr;
foreach my $base (
  [sentences  => $base_sentences],
  [paragraphs => $base_paragraphs],
  [pagebreaks => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# The third element is only added if at least one base layer was collected
push @layers, [
  'DeReKo',
  'Structure',
  (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())
];

# Glemm, Malt, MDParser
push @layers, ['Glemm', 'Morpho'];
push @layers, ['Malt', 'Dependency'];
push @layers, ['MDParser', 'Dependency'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];
246
247
# Check filters:
# If everything is skipped, only the explicitly requested annotations
# (split into foundry and layer) are used. Otherwise all supported layers
# are used, except those whose foundry or foundry#layer is skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      my $foundry = lc($_->[0]);
      !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
    } @layers);
266
# Get tokenization basis.
# The variables are declared unconditionally, because the behaviour
# of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that is used by all commands to convert
# a single text directory to a Krill document
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
290
Akron941c1a62016-02-23 17:41:41 +0100291
# Get file name based on path information.
#
# Expects a path and returns a flat name for it:
# - a leading temporary directory fragment ("/tmp/<name>") is removed,
# - the first occurrence of the first input path is removed
#   (without its last path segment, if the input is a directory),
# - all slashes are turned into dashes and leading dashes are removed,
# - the first dash separated part is dropped, if at least
#   four parts exist.
sub get_file_name ($) {
  my $file = shift;

  # Base path that should not be part of the file name
  my $base = $input[0];
  $base =~ s![^\/]+$!! if -d $base;

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # The base path is user input and has to be matched literally,
  # otherwise characters like '.' or '+' are treated as regex syntax
  $file =~ s/\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
308
# Convert sigle to path construct
# (CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands write to a directory - in case an output is given,
# it has to be an existing directory. This is an error, so the
# script ends with a non-zero exit status.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
318
319
# No command given: Process a single document
unless ($cmd) {

  # Remember the start time as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time spent since the last call and since the start
  # on info level, then start the next interval
  sub stop_time {
    my $now        = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last . ' (overall: ' . $overall . ')'
    );
    $main::LAST_STOP = $now;
  };

  # The document directory is the first input,
  # a trailing slash is appended if missing
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000350
# Extract XML files.
# Ends the script with exit status 0, if everything requested was
# extracted, and with exit status 1 otherwise.
elsif ($cmd eq 'extract') {

  # Create new archive object - the first input has to be a file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  my $prefix = 1;

  # Number of failed extractions
  my $failed = 0;

  # Print the result of a single extraction and remember failures
  my $report = sub {
    my $success = shift;
    $failed++ unless $success;
    print '... ' . ($success ? '' : 'not ');
    print "extracted.\n";
  };

  # No sigles given - extract all texts of the archive
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text_id)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text_id);
    };
  }

  # Check sigle for doc sigles
  else {
    my @text_sigle;
    my $prefix_checked = 0;

    # Check once (on the first sigle) if a prefix is needed
    my $check_prefix = sub {
      return if $prefix_checked;
      if ($prefix = $archive->check_prefix) {
        print " with prefix ...";
      };
      $prefix_checked = 1;
      return;
    };

    # Iterate over all sigle
    foreach my $sigle (@sigle) {

      # Sigle is a doc sigle (two path segments) - extract immediately
      if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
        print "$sigle ...";
        $check_prefix->();
        print "\n";

        # TODO: Make this OS independent
        my $path = ($prefix ? './' : '') . $sigle;

        $report->(scalar $archive->extract_doc($path, $output, $jobs));
      }

      # Sigle is a text sigle - extract below
      else {
        push @text_sigle, $sigle;
        $check_prefix->();
      };
    };
    @sigle = @text_sigle;
  };

  # Iterate over all given text sigles and extract
  foreach my $sigle (@sigle) {
    print "$sigle ...\n";

    # TODO: Make this OS independent
    my $path = ($prefix ? './' : '') . $sigle;

    $report->(scalar $archive->extract_text($path, $output));
  };

  print "\n";
  exit($failed ? 1 : 0);
}
457
# Process an archive:
# Converts all texts found in a corpus directory or in a zip archive
# (with optionally attached further archives) to files in $output.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories are created using File::Temp below -
  # load it explicitly instead of relying on other modules
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The last parameter is the data passed to finish():
      # The message to print and optionally a temporary directory
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      # A return value of -1 means that nothing was written
      # because the file already exists
      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horribly slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed on
        # to be released when the job is reported
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  # Nothing to process: No job was started and no start time was taken,
  # so clean up and end with an error instead of reporting a duration
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000626
627__END__
Akron941c1a62016-02-23 17:41:41 +0100628
629=pod
630
631=encoding utf8
632
633=head1 NAME
634
Akronf7ad89e2016-03-16 18:22:47 +0100635korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100636
637
638=head1 SYNOPSIS
639
Akrona76d8352016-10-27 16:27:32 +0200640 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100641
Akron2fd402b2016-10-27 21:26:48 +0200642
Akron941c1a62016-02-23 17:41:41 +0100643=head1 DESCRIPTION
644
645L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
646compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100647The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100648
649
650=head1 INSTALLATION
651
652The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
653
Akronaf386982016-10-12 00:33:25 +0200654 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100655
Akronc13a1702016-03-15 19:33:14 +0100656In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100657be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200658Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100660
661=head1 ARGUMENTS
662
Akrona76d8352016-10-27 16:27:32 +0200663 $ korapxml2krill -z --input <directory> --output <filename>
664
665Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200666It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200667
Akron941c1a62016-02-23 17:41:41 +0100668=over 2
669
670=item B<archive>
671
Akrona76d8352016-10-27 16:27:32 +0200672 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
673
Akron2fd402b2016-10-27 21:26:48 +0200674Converts an archive of KorAP-XML documents. It expects a directory
675(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100676
677=item B<extract>
678
Akrona76d8352016-10-27 16:27:32 +0200679 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
680
681Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100682
683=back
684
685
686=head1 OPTIONS
687
688=over 2
689
Akrona76d8352016-10-27 16:27:32 +0200690=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100691
Akrona76d8352016-10-27 16:27:32 +0200692Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100693
Akron7606afa2016-10-25 16:23:49 +0200694Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100695document, while C<archive> expects a KorAP-XML corpus folder or a zip
696file to batch process multiple files.
697C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200698
Akrona76d8352016-10-27 16:27:32 +0200699C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200700that the first archive listed contains all primary data files
701and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200702
Akron7606afa2016-10-25 16:23:49 +0200703 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200704
Akron0c3e3752016-06-28 15:55:53 +0200705(The directory structure follows the base directory format,
706that may include a C<.> root folder.
707In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200708need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200710
Akron7606afa2016-10-25 16:23:49 +0200711To support zip files, a version of C<unzip> needs to be installed that is
712compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200713
Akron7606afa2016-10-25 16:23:49 +0200714B<The root folder switch using the hash sign is experimental and
715may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200716
Akron941c1a62016-02-23 17:41:41 +0100717=item B<--output|-o> <directory|file>
718
719Output folder for archive processing or
720document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100721writes to C<STDOUT> by default
722(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100723
724=item B<--overwrite|-w>
725
726Overwrite files that already exist.
727
Akron3741f8b2016-12-21 19:55:21 +0100728=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100729
730Define the default tokenization by specifying
731the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100732of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100733
Akron3741f8b2016-12-21 19:55:21 +0100734
735=item B<--base-sentences|-bs> <foundry>#<layer>
736
737Define the layer for base sentences.
738If given, this will be used instead of using C<Base#Sentences>.
739Currently C<DeReKo#Structure> is the only additional layer supported.
740
741 Defaults to unset.
742
743
744=item B<--base-paragraphs|-bp> <foundry>#<layer>
745
746Define the layer for base paragraphs.
747If given, this will be used instead of using C<Base#Paragraphs>.
748Currently C<DeReKo#Structure> is the only additional layer supported.
749
750 Defaults to unset.
751
752
Akron41ac10b2017-02-08 22:47:25 +0100753=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
754
755Define the layer for base pagebreaks.
756Currently C<DeReKo#Structure> is the only layer supported.
757
758 Defaults to unset.
759
760
Akron941c1a62016-02-23 17:41:41 +0100761=item B<--skip|-s> <foundry>[#<layer>]
762
Akronf7ad89e2016-03-16 18:22:47 +0100763Skip specific annotations by specifying the foundry
764(and optionally the layer with a C<#>-prefix),
765e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100766Can be set multiple times.
767
Akronc13a1702016-03-15 19:33:14 +0100768=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100769
Akronf7ad89e2016-03-16 18:22:47 +0100770Convert specific annotations by specifying the foundry
771(and optionally the layer with a C<#>-prefix),
772e.g. C<Mate> or C<Mate#Morpho>.
773Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100774
775=item B<--primary|-p>
776
Akronc13a1702016-03-15 19:33:14 +0100777Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100778Can be flagged using C<--no-primary> as well.
779This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100780
781=item B<--jobs|-j>
782
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100784for archive processing.
Akron11c80302016-03-18 19:44:43 +0100785Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100786This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100787
Akron35db6e32016-03-17 22:42:22 +0100788=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100789
Akron35db6e32016-03-17 22:42:22 +0100790Define the metadata parser to use. Defaults to C<I5>.
791Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
792This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100793
794=item B<--pretty|-y>
795
Akronc13a1702016-03-15 19:33:14 +0100796Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100797This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100798
799=item B<--gzip|-z>
800
Akronf7ad89e2016-03-16 18:22:47 +0100801Compress the output.
802Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100803
Akron11c80302016-03-18 19:44:43 +0100804=item B<--cache|-c>
805
806File to mmap a cache (using L<Cache::FastMmap>).
807Defaults to C<korapxml2krill.cache> in the calling directory.
808
809=item B<--cache-size|-cs>
810
811Size of the cache. Defaults to C<50m>.
812
813=item B<--cache-init|-ci>
814
815Initialize cache file.
816Can be flagged using C<--no-cache-init> as well.
817Defaults to C<true>.
818
819=item B<--cache-delete|-cd>
820
821Delete cache file after processing.
822Can be flagged using C<--no-cache-delete> as well.
823Defaults to C<true>.
824
Akrone10ad322016-02-27 10:54:26 +0100825=item B<--sigle|-sg>
826
Akron20807582016-10-26 17:11:34 +0200827Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100828Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100829I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200830Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200831In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200832On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100833
Akron941c1a62016-02-23 17:41:41 +0100834=item B<--log|-l>
835
836The L<Log::Log4perl> log level, defaults to C<ERROR>.
837
838=item B<--help|-h>
839
840Print this document.
841
842=item B<--version|-v>
843
844Print version information.
845
846=back
847
Akronc13a1702016-03-15 19:33:14 +0100848=head1 ANNOTATION SUPPORT
849
850L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
851developed in the KorAP project that are part of the KorAP preprocessing pipeline.
852The base foundry with paragraphs, sentences, and the text element is mandatory for
853L<Krill|https://github.com/KorAP/Krill>.
854
Akronf7ad89e2016-03-16 18:22:47 +0100855=over 2
Akronc13a1702016-03-15 19:33:14 +0100856
857=item B<Base>
858
859=over 4
860
Akronf7ad89e2016-03-16 18:22:47 +0100861=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100862
Akronf7ad89e2016-03-16 18:22:47 +0100863=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100864
865=back
866
867=item B<Connexor>
868
869=over 4
870
Akronf7ad89e2016-03-16 18:22:47 +0100871=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100872
Akronf7ad89e2016-03-16 18:22:47 +0100873=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100874
Akronf7ad89e2016-03-16 18:22:47 +0100875=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100876
Akronf7ad89e2016-03-16 18:22:47 +0100877=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100878
879=back
880
881=item B<CoreNLP>
882
883=over 4
884
Akronf7ad89e2016-03-16 18:22:47 +0100885=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100886
Akronf7ad89e2016-03-16 18:22:47 +0100887=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100888
Akronf7ad89e2016-03-16 18:22:47 +0100889=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100890
Akronf7ad89e2016-03-16 18:22:47 +0100891=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100892
893=back
894
895=item B<DeReKo>
896
897=over 4
898
Akronf7ad89e2016-03-16 18:22:47 +0100899=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100900
901=back
902
903=item B<Glemm>
904
905=over 4
906
Akronf7ad89e2016-03-16 18:22:47 +0100907=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100908
909=back
910
911=item B<Mate>
912
913=over 4
914
Akronf7ad89e2016-03-16 18:22:47 +0100915=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100916
Akronf7ad89e2016-03-16 18:22:47 +0100917=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100918
919=back
920
921=item B<OpenNLP>
922
923=over 4
924
Akronf7ad89e2016-03-16 18:22:47 +0100925=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100926
Akronf7ad89e2016-03-16 18:22:47 +0100927=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100928
929=back
930
931=item B<Sgbr>
932
933=over 4
934
Akronf7ad89e2016-03-16 18:22:47 +0100935=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100936
Akronf7ad89e2016-03-16 18:22:47 +0100937=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100938
939=back
940
Akron4fa37c32017-01-20 14:43:10 +0100941=item B<DRuKoLa>
942
943=over 4
944
945=item #Morpho
946
947=back
948
Akronc13a1702016-03-15 19:33:14 +0100949=item B<TreeTagger>
950
951=over 4
952
Akronf7ad89e2016-03-16 18:22:47 +0100953=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100954
Akronf7ad89e2016-03-16 18:22:47 +0100955=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100956
957=back
958
959=item B<XIP>
960
961=over 4
962
Akronf7ad89e2016-03-16 18:22:47 +0100963=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100964
Akronf7ad89e2016-03-16 18:22:47 +0100965=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100966
Akronf7ad89e2016-03-16 18:22:47 +0100967=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100968
969=back
970
971=back
972
973More importers are in preparation.
974New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
975See the built-in annotation importers as examples.
976
Akron941c1a62016-02-23 17:41:41 +0100977=head1 AVAILABILITY
978
979 https://github.com/KorAP/KorAP-XML-Krill
980
981
982=head1 COPYRIGHT AND LICENSE
983
Akron3ec0a1c2017-01-18 14:41:55 +0100984Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100985
Akron941c1a62016-02-23 17:41:41 +0100986Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200987Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100988
989L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
990Corpus Analysis Platform at the
991L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
992member of the
993L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
994
995This program is free software published under the
996L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
997
998=cut