blob: af1da02bec8daef2a6de1591171f94366a89c183 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010019use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010020use v5.10;
21use Sys::Info;
22use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020023use File::Glob ':bsd_glob';
Akronc11f7982017-02-21 21:20:14 +010024
25# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010026# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010027# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010028
Akronc11f7982017-02-21 21:20:14 +010029# TODO: Use KorAP::XML::ForkPool!
30
Akron941c1a62016-02-23 17:41:41 +010031# CHANGES:
32# ----------------------------------------------------------
33# 2013/11/25
34# - Initial release
35#
36# 2014/10/29
37# - Merges foundry data to create indexer friendly documents
38#
Akron93d620e2016-02-05 19:40:05 +010039# 2016/02/04
40# - renamed to korapxml2krill
41# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010042#
43# 2016/02/12
44# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010045# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010046#
47# 2016/02/14
48# - Added version information
Akron941c1a62016-02-23 17:41:41 +010049# - Added support for archive files
50#
51# 2016/02/15
52# - Fixed temporary directory bug
53# - Improved skipping before unzipping
54# - Added EXPERIMENTAL concurrency support
55#
56# 2016/02/23
57# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010058#
59# 2016/02/27
60# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010061#
62# 2016/03/17
63# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010064#
65# 2016/03/18
66# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020067#
Akronf3f0c942016-06-27 13:27:14 +020068# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020069# - Added multi archive support
70# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020071# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020072#
73# 2016/07/06
74# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020075#
76# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020077# - Fixed temporary path issue in script
78#
79# 2016/10/24
80# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020081#
Akronb4bbec72016-10-26 20:21:02 +020082# 2016/10/24
83# - Added support for document extraction
84#
Akron3741f8b2016-12-21 19:55:21 +010085# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020086# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020087#
Akron3741f8b2016-12-21 19:55:21 +010088# 2016/12/21
89# - added support for base-sentences and base-tokenizations
90#
Akron4fa37c32017-01-20 14:43:10 +010091# 2017/01/20
92# - added support for DRuKoLa annotations
93#
Akron41ac10b2017-02-08 22:47:25 +010094# 2017/02/08
95# - added support for pagebreak annotations
96#
Akron821db3d2017-04-06 21:19:31 +020097# 2017/04/06
98# - added support for wildcards in input
99#
Akron941c1a62016-02-23 17:41:41 +0100100# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100101
Akron821db3d2017-04-06 21:19:31 +0200102our $LAST_CHANGE = '2017/04/06';
Akron941c1a62016-02-23 17:41:41 +0100103our $LOCAL = $FindBin::Bin;
104our $VERSION_MSG = <<"VERSION";
105Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
106VERSION
107
# Parse command: a leading argument that does not start
# with a dash is not an option but the command to execute
our @ARGV;
my $cmd = ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') ? shift(@ARGV) : undef;
Akron93d620e2016-02-05 19:40:05 +0100114
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options and flags without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with a default value
my $token_base      = 'OpenNLP#tokens';
my $base_sentences  = '';
my $base_paragraphs = '';
my $base_pagebreaks = '';
my $cache_file      = 'korapxml2krill.cache';
my $log_level       = 'ERROR';
my $jobs            = 0;
my $cache_size      = '50m';
my $cache_delete    = 1;
my $cache_init      = 1;

# Handler for --help: print the usage sections of the POD
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: print the version message
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  )
};

# Parse options from the command line
GetOptions(
  'input|i=s'             => \@input,
  'output|o=s'            => \$output,
  'overwrite|w'           => \$overwrite,
  'meta|m=s'              => \$meta,
  'token|t=s'             => \$token_base,
  'base-sentences|bs=s'   => \$base_sentences,
  'base-paragraphs|bp=s'  => \$base_paragraphs,
  'base-pagebreaks|bpb=s' => \$base_pagebreaks,
  'gzip|z'                => \$gzip,
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \$cache_file,
  'log|l=s'               => \$log_level,
  'anno|a=s'              => \@anno,
  'primary|p!'            => \$primary,
  'pretty|y'              => \$pretty,
  'jobs|j=i'              => \$jobs,
  'cache-size|cs=s'       => \$cache_size,
  'cache-delete|cd!'      => \$cache_delete,
  'cache-init|ci!'        => \$cache_init,
  'help|h'                => $print_help,
  'version|v'             => $print_version
);
156
# Lowercase the base layer names, so they can be
# compared case insensitively later on
$_ = lc $_ foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);

# Parameters for pod2usage to print the usage
# information and exit with an error status
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined, and gzip has
# no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};
# Configuration of the log4perl object:
# Log everything from the requested level on to STDERR
my %log_conf = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
184
Akronc11f7982017-02-21 21:20:14 +0100185
# A job count of -1 requests an automatic setting:
# five times the number of CPUs reported by Sys::Info
if ($jobs == -1) {
  my $cpu = Sys::Info->new->device('CPU');
  $jobs = ceil($cpu->count * 5);
  $log->info('Run using ' . $jobs . ' jobs');
};
191
Akron821db3d2017-04-06 21:19:31 +0200192
# Lookup table of all (lowercased) foundries
# and foundry#layer combinations to skip
my %skip = map { (lc($_) => 1) } @skip;

# Collect the base annotations that are
# requested from the DeReKo structure layer
my %base_of = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $base_of{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

# The DeReKo structure layer takes an additional parameter
# in case it is used as the source of base annotations
my @dereko = ('DeReKo', 'Structure');
push @dereko, 'base-' . join('-', @dereko_attr) if @dereko_attr;

# All supported annotation layers in processing order.
# The Base layers are only used, if no other
# base layer was defined for them.
my @layers = (
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  (map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/),

  # CoreNLP
  (map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/),

  # DeReKo
  \@dereko,

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  (map { ['Mate', $_] } qw/Morpho Dependency/),

  # OpenNLP
  (map { ['OpenNLP', $_] } qw/Morpho Sentences/),

  # Schreibgebrauch
  (map { ['Sgbr', $_] } qw/Lemma Morpho/),

  # TreeTagger
  (map { ['TreeTagger', $_] } qw/Morpho Sentences/),

  # XIP
  (map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
269
Akron4fa37c32017-01-20 14:43:10 +0100270
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested
# annotations (given as foundry#layer) are converted
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all known annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, unless the foundry or the foundry#layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
289
# Get tokenization basis (foundry and layer are separated by '#').
# The declaration is kept separate from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
292
# Parameters of the shared cache file
# TODO: This should not be initialized for batch
my %cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(%cache_param);

# Parameters of the batch object:
# The tokenization basis is passed as foundry and layer,
# the annotations to convert were filtered beforehand
my @batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
313
Akron941c1a62016-02-23 17:41:41 +0100314
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it:
# temporary directory fragments and the base path of the first
# input are removed, and the remaining path segments are joined
# by dashes. If at least four segments remain, the leading one
# is dropped as well.
sub get_file_name ($) {
  my $file = shift;

  # The base path to remove is derived from the first input.
  # For directories the last path segment is kept in the name.
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path (with an optional leading slash).
  # The path is quoted, so regex metacharacters in the input
  # (like '.', '+' or '(') are matched literally instead of
  # being interpreted as (possibly invalid) pattern syntax.
  $file =~ s/\/?\Q$base\E//;

  # Join the path segments with dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
331
# Convert sigle to path construct,
# i.e. CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
334
# Commands write to an output directory, that needs to exist.
# A missing directory is an error, so the script exits with
# a non-zero status (like all other error exits do).
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
341
342
# Glob files
if (@input) {

  # Expand all inputs
  my @new_input = map { bsd_glob($_) } @input;

  # Rewrite the input whenever the expansion changed it - this includes
  # wildcards matching exactly one file, where the number of inputs
  # stays the same. Inputs without any match are left untouched.
  if (@new_input && join("\0", @new_input) ne join("\0", @input)) {

    # The shortest path is expected to be the primary archive
    @input = sort { length($a) <=> length($b) } @new_input;
    print 'Input rewritten to ' . join(',', @input) . "\n";
  };
};
357
358
# No command given: Process a single document
if (!$cmd) {

  # Remember the time at compilation for benchmarking
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and the
  # overall time, and start a new lap
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # The document path requires a trailing slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000389
# Extract XML files
#
# Extracts all given sigles (or all texts, if no sigle is given)
# from the archives to the output directory. The script exits with
# status 0, if everything was extracted, and with status 1 otherwise.
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    my $prefix = 1;

    # Number of sigles that could not be extracted
    my $failed = 0;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {
      my @new_sigle;

      my $prefix_check = 0;

      # Check once if a prefix is needed and report on it
      my $check_prefix = sub {
        return if $prefix_check;

        if ($prefix = $archive->check_prefix) {
          print " with prefix ...";
        };
        $prefix_check = 1;
        return;
      };

      # Iterate over all sigle
      foreach (@sigle) {

        # Sigle is a doc sigle
        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$_ ...";

          # Check if a prefix is needed
          $check_prefix->();

          print "\n";

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $_;

          # Documents are extracted immediately
          my $extracted = $archive->extract_doc($path, $output, $jobs);
          $failed++ unless $extracted;

          print '... ' . ($extracted ? '' : 'not ');
          print "extracted.\n";
        }

        # Sigle is a text sigle - texts are extracted later
        else {
          push @new_sigle, $_;

          # Check if a prefix is needed
          $check_prefix->();
        };
      };
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {

      print "$_ ...\n";

      # TODO: Make this OS independent
      my $extracted = $archive->extract_text(
        ($prefix ? './' : '') . $_, $output
      );
      $failed++ unless $extracted;

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # Only signal an error, if something could not be extracted
    exit($failed ? 1 : 0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
496
# Process an archive
#
# Converts all texts of a corpus directory or of one or more zip
# archives, optionally in parallel forks (see the jobs option).
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories for extracted texts are created using
  # File::Temp, which is not loaded at the top of this file
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data structure passed to finish() is the last parameter
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory, if one was passed
      $data->[1] = undef if $data->[1];
    }
  );

  # Start time of the processing - stays undefined,
  # if the input can't be processed at all
  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory
        # is passed on to be released afterwards
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  # Nothing was processed: Signal the error instead of
  # calculating a time difference to an undefined start time
  exit(1) unless $t;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000665
666__END__
Akron941c1a62016-02-23 17:41:41 +0100667
668=pod
669
670=encoding utf8
671
672=head1 NAME
673
Akronf7ad89e2016-03-16 18:22:47 +0100674korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100675
676
677=head1 SYNOPSIS
678
Akrona76d8352016-10-27 16:27:32 +0200679 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100680
Akron2fd402b2016-10-27 21:26:48 +0200681
Akron941c1a62016-02-23 17:41:41 +0100682=head1 DESCRIPTION
683
684L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
685compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100686The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100687
688
689=head1 INSTALLATION
690
691The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
692
Akronaf386982016-10-12 00:33:25 +0200693 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100694
Akronc13a1702016-03-15 19:33:14 +0100695In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100696be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200697Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100699
700=head1 ARGUMENTS
701
Akrona76d8352016-10-27 16:27:32 +0200702 $ korapxml2krill -z --input <directory> --output <filename>
703
704Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200705It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200706
Akron941c1a62016-02-23 17:41:41 +0100707=over 2
708
709=item B<archive>
710
Akrona76d8352016-10-27 16:27:32 +0200711 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
712
Akron2fd402b2016-10-27 21:26:48 +0200713Converts an archive of KorAP-XML documents. It expects a directory
714(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100715
716=item B<extract>
717
Akrona76d8352016-10-27 16:27:32 +0200718 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
719
720Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100721
722=back
723
724
725=head1 OPTIONS
726
727=over 2
728
Akrona76d8352016-10-27 16:27:32 +0200729=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100730
Akrona76d8352016-10-27 16:27:32 +0200731Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100732
Akron7606afa2016-10-25 16:23:49 +0200733Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100734document, while C<archive> expects a KorAP-XML corpus folder or a zip
735file to batch process multiple files.
736C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200737
Akrona76d8352016-10-27 16:27:32 +0200738C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200739that the first archive listed contains all primary data files
740and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200741
Akron7606afa2016-10-25 16:23:49 +0200742 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200743
Akron821db3d2017-04-06 21:19:31 +0200744Input may also be defined using BSD glob wildcards.
745
746 -i 'file/news*.zip'
747
748The extended input array will be sorted in length order, so the shortest
749path needs to contain all primary data files and all meta data files.
750
Akron0c3e3752016-06-28 15:55:53 +0200751(The directory structure follows the base directory format,
752that may include a C<.> root folder.
753In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200754need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200756
Akron7606afa2016-10-25 16:23:49 +0200757To support zip files, a version of C<unzip> needs to be installed that is
758compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200759
Akron7606afa2016-10-25 16:23:49 +0200760B<The root folder switch using the hash sign is experimental and
761may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200762
Akron941c1a62016-02-23 17:41:41 +0100763=item B<--output|-o> <directory|file>
764
765Output folder for archive processing or
766document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100767writes to C<STDOUT> by default
768(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100769
770=item B<--overwrite|-w>
771
772Overwrite files that already exist.
773
Akron3741f8b2016-12-21 19:55:21 +0100774=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100775
776Define the default tokenization by specifying
777the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100778of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100779
Akron3741f8b2016-12-21 19:55:21 +0100780
781=item B<--base-sentences|-bs> <foundry>#<layer>
782
783Define the layer for base sentences.
784If given, this will be used instead of using C<Base#Sentences>.
785Currently C<DeReKo#Structure> is the only additional layer supported.
786
787 Defaults to unset.
788
789
790=item B<--base-paragraphs|-bp> <foundry>#<layer>
791
792Define the layer for base paragraphs.
793If given, this will be used instead of using C<Base#Paragraphs>.
794Currently C<DeReKo#Structure> is the only additional layer supported.
795
796 Defaults to unset.
797
798
Akron41ac10b2017-02-08 22:47:25 +0100799=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
800
801Define the layer for base pagebreaks.
802Currently C<DeReKo#Structure> is the only layer supported.
803
804 Defaults to unset.
805
806
Akron941c1a62016-02-23 17:41:41 +0100807=item B<--skip|-s> <foundry>[#<layer>]
808
Akronf7ad89e2016-03-16 18:22:47 +0100809Skip specific annotations by specifying the foundry
810(and optionally the layer with a C<#>-prefix),
811e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100812Can be set multiple times.
813
Akronc13a1702016-03-15 19:33:14 +0100814=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100815
Akronf7ad89e2016-03-16 18:22:47 +0100816Convert specific annotations by specifying the foundry
817(and optionally the layer with a C<#>-prefix),
818e.g. C<Mate> or C<Mate#Morpho>.
819Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100820
821=item B<--primary|-p>
822
Akronc13a1702016-03-15 19:33:14 +0100823Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100824Can be flagged using C<--no-primary> as well.
825This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100826
827=item B<--jobs|-j>
828
829Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100830for archive processing.
Akron11c80302016-03-18 19:44:43 +0100831Defaults to C<0> (everything runs in a single process).
Akronc11f7982017-02-21 21:20:14 +0100832Pass -1, and the value will be set automatically to 5
833times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +0100834This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100835
Akron35db6e32016-03-17 22:42:22 +0100836=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100837
Akron35db6e32016-03-17 22:42:22 +0100838Define the metadata parser to use. Defaults to C<I5>.
839Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
840This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100841
842=item B<--pretty|-y>
843
Akronc13a1702016-03-15 19:33:14 +0100844Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100845This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100846
847=item B<--gzip|-z>
848
Akronf7ad89e2016-03-16 18:22:47 +0100849Compress the output.
850Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100851
Akron11c80302016-03-18 19:44:43 +0100852=item B<--cache|-c>
853
854File to mmap a cache (using L<Cache::FastMmap>).
855Defaults to C<korapxml2krill.cache> in the calling directory.
856
857=item B<--cache-size|-cs>
858
859Size of the cache. Defaults to C<50m>.
860
861=item B<--cache-init|-ci>
862
863Initialize cache file.
864Can be flagged using C<--no-cache-init> as well.
865Defaults to C<true>.
866
867=item B<--cache-delete|-cd>
868
869Delete cache file after processing.
870Can be flagged using C<--no-cache-delete> as well.
871Defaults to C<true>.
872
Akrone10ad322016-02-27 10:54:26 +0100873=item B<--sigle|-sg>
874
Akron20807582016-10-26 17:11:34 +0200875Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100876Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100877I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200878Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200879In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200880On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100881
Akron941c1a62016-02-23 17:41:41 +0100882=item B<--log|-l>
883
884The L<Log::Log4perl> log level, defaults to C<ERROR>.
885
886=item B<--help|-h>
887
888Print this document.
889
890=item B<--version|-v>
891
892Print version information.
893
894=back
895
Akronc13a1702016-03-15 19:33:14 +0100896=head1 ANNOTATION SUPPORT
897
898L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
899developed in the KorAP project that are part of the KorAP preprocessing pipeline.
900The base foundry with paragraphs, sentences, and the text element is mandatory for
901L<Krill|https://github.com/KorAP/Krill>.
902
Akron821db3d2017-04-06 21:19:31 +0200903 Base
904 #Paragraphs
905 #Sentences
Akronc13a1702016-03-15 19:33:14 +0100906
Akron821db3d2017-04-06 21:19:31 +0200907 Connexor
908 #Morpho
909 #Phrase
910 #Sentences
911 #Syntax
Akronc13a1702016-03-15 19:33:14 +0100912
Akron821db3d2017-04-06 21:19:31 +0200913 CoreNLP
914 #Constituency
915 #Morpho
916 #NamedEntities
917 #Sentences
Akronc13a1702016-03-15 19:33:14 +0100918
Akron821db3d2017-04-06 21:19:31 +0200919 DeReKo
920 #Structure
Akronc13a1702016-03-15 19:33:14 +0100921
Akron821db3d2017-04-06 21:19:31 +0200922 DRuKoLa
923 #Morpho
Akronc13a1702016-03-15 19:33:14 +0100924
Akron821db3d2017-04-06 21:19:31 +0200925 Glemm
926 #Morpho
Akronc13a1702016-03-15 19:33:14 +0100927
Akron821db3d2017-04-06 21:19:31 +0200928 Malt
929 #Dependency
Akronc13a1702016-03-15 19:33:14 +0100930
Akron821db3d2017-04-06 21:19:31 +0200931 MarMoT
932 #Morpho
Akronc13a1702016-03-15 19:33:14 +0100933
Akron821db3d2017-04-06 21:19:31 +0200934 Mate
935 #Dependency
936 #Morpho
Akronc13a1702016-03-15 19:33:14 +0100937
Akron821db3d2017-04-06 21:19:31 +0200938 MDParser
939 #Dependency
Akronc13a1702016-03-15 19:33:14 +0100940
Akron821db3d2017-04-06 21:19:31 +0200941 OpenNLP
942 #Morpho
943 #Sentences
Akronc13a1702016-03-15 19:33:14 +0100944
Akron821db3d2017-04-06 21:19:31 +0200945 Sgbr
946 #Lemma
947 #Morpho
Akronc13a1702016-03-15 19:33:14 +0100948
Akron821db3d2017-04-06 21:19:31 +0200949 TreeTagger
950 #Morpho
951 #Sentences
Akronc13a1702016-03-15 19:33:14 +0100952
Akron821db3d2017-04-06 21:19:31 +0200953 XIP
954 #Constituency
955 #Morpho
956 #Sentences
Akronc13a1702016-03-15 19:33:14 +0100957
Akronc13a1702016-03-15 19:33:14 +0100958
959More importers are in preparation.
960New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
961See the built-in annotation importers as examples.
962
Akron941c1a62016-02-23 17:41:41 +0100963=head1 AVAILABILITY
964
965 https://github.com/KorAP/KorAP-XML-Krill
966
967
968=head1 COPYRIGHT AND LICENSE
969
Akron3ec0a1c2017-01-18 14:41:55 +0100970Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100971
Akron941c1a62016-02-23 17:41:41 +0100972Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200973Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100974
975L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
976Corpus Analysis Platform at the
977L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
978member of the
979L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
980
981This program is free software published under the
982L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
983
984=cut