#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files

# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
#
# 2016/07/06
# - Added MDParser#Dependency
#
# 2016/10/15
# - Fixed temporary path issue in script
#
# 2016/10/24
# - Improved Windows support
#
# 2016/10/24
# - Added support for document extraction
#
# 2016/10/27
# - Added wildcard support for document extraction
#
# 2016/12/21
# - added support for base-sentences and base-tokenizations
#
# 2017/01/20
# - added support for DRuKoLa annotations
#
# 2017/02/08
# - added support for pagebreak annotations
#
# ----------------------------------------------------------

# Date of the most recent entry in the CHANGES list
our $LAST_CHANGE = '2017/02/08';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Message handed to pod2usage() for help, version and error output
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Parse command: A leading argument that does not start with a dash
# is taken as the command and removed from the argument list,
# so that only options are left for the option parser
my $cmd;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);

# Parameters handed to pod2usage() whenever the invocation is invalid:
# show the listed POD sections together with the version message
# and terminate with exit status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exitval  => 1
);

# Parse options from the command line.
# Scalar options are declared inline and stay visible for the rest
# of the script. Invalid options make GetOptions return false,
# in which case the usage is shown instead of continuing silently.
GetOptions(
  'input|i=s'             => \@input,
  'output|o=s'            => \(my $output),
  'overwrite|w'           => \(my $overwrite),
  'meta|m=s'              => \(my $meta),
  'token|t=s'             => \(my $token_base = 'OpenNLP#tokens'),
  'base-sentences|bs=s'   => \(my $base_sentences = ''),
  'base-paragraphs|bp=s'  => \(my $base_paragraphs = ''),
  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks = ''),
  'gzip|z'                => \(my $gzip),
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \(my $cache_file = 'korapxml2krill.cache'),
  'log|l=s'               => \(my $log_level = 'ERROR'),
  'anno|a=s'              => \@anno,
  'primary|p!'            => \(my $primary),
  'pretty|y'              => \(my $pretty),
  'jobs|j=i'              => \(my $jobs = 0),
  'cache-size|cs=s'       => \(my $cache_size = '50m'),
  'cache-delete|cd!'      => \(my $cache_delete = 1),
  'cache-init|ci!'        => \(my $cache_init = 1),
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
) or pod2usage(%ERROR_HASH);

# The base layers are compared case insensitively later on
$base_sentences  = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
$base_pagebreaks = lc $base_pagebreaks;

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;

# Initialize log4perl object:
# Everything at or above the requested level is written
# through a colored screen appender named STDERR
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

# Logger shared by the script and the batch processor
my $log = Log::Log4perl->get_logger('main');

# Lowercased lookup table of all foundries and layers to skip
my %skip;
$skip{lc $_} = 1 foreach @skip;

# DeReKo#Structure can act as the base for sentences,
# paragraphs and pagebreaks - collect what was requested
my @dereko_attr;
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotations as [Foundry, Layer, (Parameter)?].
# The order of this list is the order of @filtered_anno below.
my @layers = (

  # Base - only used if no other base layer was requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo - the requested base elements are passed as a third value
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure']),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations given by --anno are used
if ($skip{'#all'}) {
  foreach my $anno (@anno) {
    push @filtered_anno, [ split('#', $anno) ];
  };
}

# Add all annotations that are not skipped
else {
  foreach my $info (@layers) {
    my $foundry = lc $info->[0];

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . lc $info->[1]};

    push @filtered_anno, $info;
  };
};

# Get tokenization basis.
# The declaration is separated from the conditional assignment,
# as 'my ... if ...' has no defined behaviour in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Cache shared by the batch processor
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that converts a single text
# based on the settings from the command line
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);

# Get file name based on path information.
#
# Expects the path of a text and returns a flat name, in which the
# path separators are replaced by dashes. Temporary directory
# fragments and the first occurrence of the input path are removed
# and only the last three dash separated parts are kept.
# Relies on the first element of the file level @input.
sub get_file_name ($) {
  my $file = shift;

  # For directories, only the path leading to the directory is removed
  my $base = $input[0];
  $base =~ s![^/]+$!! if -d $base;

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path - quoted, so characters like '.' or '+'
  # in the path are not treated as regular expression syntax
  $file =~ s{/?\Q$base\E}{};

  # Flatten the path
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};

  # Limit to the last three parts
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};

# Convert sigle to path construct,
# i.e. the first underscore and the following dot become slashes
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Commands write to an output directory, that has to exist.
# This is an error, therefore the exit status is not zero.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};


# Process a single file
unless ($cmd) {

  # Take the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # The path of the document requires a trailing slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}

# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # True, if the paths in the archive require a './' prefix
    my $prefix = 1;

    # The archive is asked only once if a prefix is needed
    my $prefix_checked = 0;
    my $check_prefix = sub {
      return if $prefix_checked;
      $prefix_checked = 1;
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix;
      return;
    };

    # No sigles given - extract all texts of the archive
    unless (@sigle) {

      # Get files
      foreach my $path ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {
      my @text_sigle;

      # Iterate over all sigle
      foreach my $sigle (@sigle) {

        # Sigle is a doc sigle, i.e. it has two parts only
        if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$sigle ...";

          # Check if a prefix is needed
          $check_prefix->();

          print "\n";

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $sigle;

          print '... ' . (
            $archive->extract_doc(
              $path, $output, $jobs
            ) ? '' : 'not '
          );
          print "extracted.\n";
        }

        # Sigle is a text sigle - extracted below
        else {
          push @text_sigle, $sigle;

          # Check if a prefix is needed
          $check_prefix->();
        };
      };
      @sigle = @text_sigle;
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {

      print "$sigle ...\n";

      # TODO: Make this OS independent
      print '... ' . (
        $archive->extract_text(
          ($prefix ? './' : '') . $sigle, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    # The command ends here with a zero exit status
    print "\n";
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}

# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # All target files are created in the output directory,
  # so the paths can't be built without it
  unless (defined $output && length $output) {
    print "Archive processing requires an output directory.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The last parameter is the data structure passed to finish(),
      # which may be missing
      my $data = pop;
      $data = [] unless ref $data;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . ($data->[0] // '') . "\n";

      # Release the temporary directory object
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # TODO: Check here, if the cache file can be initialized

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory to extract to
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horribly slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed along
        # to be released in the finish callback
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  # Nothing was processed, so there is nothing to wait for
  # and no duration to report
  else {
    print "Input is neither a directory nor an archive.\n\n";
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}

__END__
Akron941c1a62016-02-23 17:41:41 +0100631
632=pod
633
634=encoding utf8
635
636=head1 NAME
637
Akronf7ad89e2016-03-16 18:22:47 +0100638korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100639
640
641=head1 SYNOPSIS
642
Akrona76d8352016-10-27 16:27:32 +0200643 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100644
Akron2fd402b2016-10-27 21:26:48 +0200645
Akron941c1a62016-02-23 17:41:41 +0100646=head1 DESCRIPTION
647
648L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
649compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100650The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100651
652
653=head1 INSTALLATION
654
655The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
656
Akronaf386982016-10-12 00:33:25 +0200657 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100658
Akronc13a1702016-03-15 19:33:14 +0100659In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100660be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200661Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +0200662In addition to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100663
664=head1 ARGUMENTS
665
Akrona76d8352016-10-27 16:27:32 +0200666 $ korapxml2krill -z --input <directory> --output <filename>
667
668Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200669It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200670
Akron941c1a62016-02-23 17:41:41 +0100671=over 2
672
673=item B<archive>
674
Akrona76d8352016-10-27 16:27:32 +0200675 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
676
Akron2fd402b2016-10-27 21:26:48 +0200677Converts an archive of KorAP-XML documents. It expects a directory
678(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100679
680=item B<extract>
681
Akrona76d8352016-10-27 16:27:32 +0200682 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
683
684Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100685
686=back
687
688
689=head1 OPTIONS
690
691=over 2
692
Akrona76d8352016-10-27 16:27:32 +0200693=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100694
Akrona76d8352016-10-27 16:27:32 +0200695Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100696
Akron7606afa2016-10-25 16:23:49 +0200697Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100698document, while C<archive> expects a KorAP-XML corpus folder or a zip
699file to batch process multiple files.
700C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200701
Akrona76d8352016-10-27 16:27:32 +0200702C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200703that the first archive listed contains all primary data files
704and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200705
Akron7606afa2016-10-25 16:23:49 +0200706 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200707
Akron0c3e3752016-06-28 15:55:53 +0200708(The directory structure follows the base directory format,
709that may include a C<.> root folder.
710In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200711need to be passed with a hash sign in front of the archive's name.
712This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200713
Akron7606afa2016-10-25 16:23:49 +0200714To support zip files, a version of C<unzip> needs to be installed that is
715compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200716
Akron7606afa2016-10-25 16:23:49 +0200717B<The root folder switch using the hash sign is experimental and
718may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200719
Akron941c1a62016-02-23 17:41:41 +0100720=item B<--output|-o> <directory|file>
721
722Output folder for archive processing or
723document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100724writes to C<STDOUT> by default
725(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100726
727=item B<--overwrite|-w>
728
729Overwrite files that already exist.
730
Akron3741f8b2016-12-21 19:55:21 +0100731=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100732
733Define the default tokenization by specifying
734the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100735of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100736
Akron3741f8b2016-12-21 19:55:21 +0100737
738=item B<--base-sentences|-bs> <foundry>#<layer>
739
740Define the layer for base sentences.
741If given, this will be used instead of using C<Base#Sentences>.
742Currently C<DeReKo#Structure> is the only additional layer supported.
743
744 Defaults to unset.
745
746
747=item B<--base-paragraphs|-bp> <foundry>#<layer>
748
749Define the layer for base paragraphs.
750If given, this will be used instead of using C<Base#Paragraphs>.
751Currently C<DeReKo#Structure> is the only additional layer supported.
752
753 Defaults to unset.
754
755
Akron41ac10b2017-02-08 22:47:25 +0100756=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
757
758Define the layer for base pagebreaks.
759Currently C<DeReKo#Structure> is the only layer supported.
760
761 Defaults to unset.
762
763
Akron941c1a62016-02-23 17:41:41 +0100764=item B<--skip|-s> <foundry>[#<layer>]
765
Akronf7ad89e2016-03-16 18:22:47 +0100766Skip specific annotations by specifying the foundry
767(and optionally the layer with a C<#>-prefix),
768e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100769Can be set multiple times.
770
Akronc13a1702016-03-15 19:33:14 +0100771=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100772
Akronf7ad89e2016-03-16 18:22:47 +0100773Convert specific annotations by specifying the foundry
774(and optionally the layer with a C<#>-prefix),
775e.g. C<Mate> or C<Mate#Morpho>.
776Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100777
778=item B<--primary|-p>
779
Akronc13a1702016-03-15 19:33:14 +0100780Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100781Can be flagged using C<--no-primary> as well.
782This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100783
784=item B<--jobs|-j>
785
786Define the number of concurrent jobs in seperated forks
Akronf7ad89e2016-03-16 18:22:47 +0100787for archive processing.
Akron11c80302016-03-18 19:44:43 +0100788Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100789This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100790
Akron35db6e32016-03-17 22:42:22 +0100791=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100792
Akron35db6e32016-03-17 22:42:22 +0100793Define the metadata parser to use. Defaults to C<I5>.
794Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
795This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100796
797=item B<--pretty|-y>
798
Akronc13a1702016-03-15 19:33:14 +0100799Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100800This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100801
802=item B<--gzip|-z>
803
Akronf7ad89e2016-03-16 18:22:47 +0100804Compress the output.
805Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100806
Akron11c80302016-03-18 19:44:43 +0100807=item B<--cache|-c>
808
809File to mmap a cache (using L<Cache::FastMmap>).
810Defaults to C<korapxml2krill.cache> in the calling directory.
811
812=item B<--cache-size|-cs>
813
814Size of the cache. Defaults to C<50m>.
815
816=item B<--cache-init|-ci>
817
818Initialize cache file.
819Can be flagged using C<--no-cache-init> as well.
820Defaults to C<true>.
821
822=item B<--cache-delete|-cd>
823
824Delete cache file after processing.
825Can be flagged using C<--no-cache-delete> as well.
826Defaults to C<true>.
827
Akrone10ad322016-02-27 10:54:26 +0100828=item B<--sigle|-sg>
829
Akron20807582016-10-26 17:11:34 +0200830Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100831Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100832I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200833Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200834In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200835On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100836
Akron941c1a62016-02-23 17:41:41 +0100837=item B<--log|-l>
838
839The L<Log::Log4perl> log level, defaults to C<ERROR>.
840
841=item B<--help|-h>
842
843Print this document.
844
845=item B<--version|-v>
846
847Print version information.
848
849=back
850
Akronc13a1702016-03-15 19:33:14 +0100851=head1 ANNOTATION SUPPORT
852
853L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
854developed in the KorAP project that are part of the KorAP preprocessing pipeline.
855The base foundry with paragraphs, sentences, and the text element is mandatory for
856L<Krill|https://github.com/KorAP/Krill>.
857
Akronf7ad89e2016-03-16 18:22:47 +0100858=over 2
Akronc13a1702016-03-15 19:33:14 +0100859
860=item B<Base>
861
862=over 4
863
Akronf7ad89e2016-03-16 18:22:47 +0100864=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100865
Akronf7ad89e2016-03-16 18:22:47 +0100866=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100867
868=back
869
870=item B<Connexor>
871
872=over 4
873
Akronf7ad89e2016-03-16 18:22:47 +0100874=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100875
Akronf7ad89e2016-03-16 18:22:47 +0100876=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100877
Akronf7ad89e2016-03-16 18:22:47 +0100878=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100879
Akronf7ad89e2016-03-16 18:22:47 +0100880=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100881
882=back
883
884=item B<CoreNLP>
885
886=over 4
887
Akronf7ad89e2016-03-16 18:22:47 +0100888=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100889
Akronf7ad89e2016-03-16 18:22:47 +0100890=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100891
Akronf7ad89e2016-03-16 18:22:47 +0100892=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100893
Akronf7ad89e2016-03-16 18:22:47 +0100894=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100895
896=back
897
898=item B<DeReKo>
899
900=over 4
901
Akronf7ad89e2016-03-16 18:22:47 +0100902=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100903
904=back
905
906=item B<Glemm>
907
908=over 4
909
Akronf7ad89e2016-03-16 18:22:47 +0100910=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100911
912=back
913
914=item B<Mate>
915
916=over 4
917
Akronf7ad89e2016-03-16 18:22:47 +0100918=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100919
Akronf7ad89e2016-03-16 18:22:47 +0100920=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100921
922=back
923
924=item B<OpenNLP>
925
926=over 4
927
Akronf7ad89e2016-03-16 18:22:47 +0100928=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100929
Akronf7ad89e2016-03-16 18:22:47 +0100930=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100931
932=back
933
934=item B<Sgbr>
935
936=over 4
937
Akronf7ad89e2016-03-16 18:22:47 +0100938=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100939
Akronf7ad89e2016-03-16 18:22:47 +0100940=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100941
942=back
943
Akron4fa37c32017-01-20 14:43:10 +0100944=item B<DRuKoLa>
945
946=over 4
947
948=item #Morpho
949
950=back
951
Akronc13a1702016-03-15 19:33:14 +0100952=item B<TreeTagger>
953
954=over 4
955
Akronf7ad89e2016-03-16 18:22:47 +0100956=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100957
Akronf7ad89e2016-03-16 18:22:47 +0100958=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100959
960=back
961
962=item B<XIP>
963
964=over 4
965
Akronf7ad89e2016-03-16 18:22:47 +0100966=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100967
Akronf7ad89e2016-03-16 18:22:47 +0100968=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100969
Akronf7ad89e2016-03-16 18:22:47 +0100970=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100971
972=back
973
974=back
975
976More importers are in preparation.
977New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
978See the built-in annotation importers as examples.
979
Akron941c1a62016-02-23 17:41:41 +0100980=head1 AVAILABILITY
981
982 https://github.com/KorAP/KorAP-XML-Krill
983
984
985=head1 COPYRIGHT AND LICENSE
986
Akron3ec0a1c2017-01-18 14:41:55 +0100987Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100988
Akron941c1a62016-02-23 17:41:41 +0100989Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200990Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100991
992L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
993Corpus Analysis Platform at the
994L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
995member of the
996L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
997
998This program is free software published under the
999L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1000
1001=cut