blob: 669c0d32604ce39961d6da9445a50bdb38fd7501 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
136# 2019/02/07
137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron941c1a62016-02-23 17:41:41 +0100139# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100140
# Script-wide metadata
our $LAST_CHANGE   = '2019/02/07';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# Message shown by the help and version output
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Forward declarations (both helpers expect exactly one scalar)
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: the first argument is treated as the command,
# unless it is empty or starts with a dash (i.e. is an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Remember the remaining arguments before option parsing consumes them
my @keep_argv = @ARGV;

# Multi-value options
my @skip;
my @sigle;
my @anno;
my @input;

my $text;
Akrone10ad322016-02-27 10:54:26 +0100162
# Parse options from the command line.
# All single-value options are file-level lexicals that stay undefined
# when the option is not given.
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Print the full usage information
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version message only
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);
208
Akron63f20d42017-04-10 23:40:29 +0200209
# Load from configuration.
# Values from the configuration file are only used for options that
# were not given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value options: configuration key => target variable
  my @scalar_options = (
    'overwrite'             => \$overwrite,
    'gzip'                  => \$gzip,
    'jobs'                  => \$jobs,
    'koral'                 => \$koral,                  # Koral version
    'input-base'            => \$input_base,             # Input root base directory
    'temporary-extract'     => \$extract_dir,
    'token'                 => \$token_base,             # Token base
    'non-word-tokens'       => \$non_word_tokens,
    'cache'                 => \$cache_file,             # Cache file
    'cache-size'            => \$cache_size,
    'cache-delete'          => \$cache_delete,
    'cache-init'            => \$cache_init,
    'sequential-extraction' => \$sequential_extraction,  # Jobs for extraction
    'meta'                  => \$meta,
    'output'                => \$output,
    'base-sentences'        => \$base_sentences,
    'base-paragraphs'       => \$base_paragraphs,
    'base-pagebreaks'       => \$base_pagebreaks,
    'to-tar'                => \$to_tar,                 # Write to tar
    'log'                   => \$log_level
  );

  while (my ($key, $target) = splice(@scalar_options, 0, 2)) {
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  }

  # Multi-value options are written as semicolon separated lists
  my @list_options = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno
  );

  while (my ($key, $target) = splice(@list_options, 0, 2)) {
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  }
}
331
Akron63f20d42017-04-10 23:40:29 +0200332
# Fall back to the built-in defaults for all options that are still
# undefined after command line and configuration were evaluated
foreach my $default (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$koral,                 $KORAL_VERSION],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR'],
  [\$non_word_tokens,       0]
) {
  my ($target, $value) = @$default;
  $$target //= $value;
}

# The base layers default to the empty string and are compared in lower case
$_ = lc($_ // '') foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100351
Akron63f20d42017-04-10 23:40:29 +0200352
# Initialize log4perl object.
# Everything at or above the requested level is written to STDERR.
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file only if it was really read: the
# configuration is loaded only when the file exists, so a missing
# file is reported as ignored instead of as read.
if ($cfg_file) {
  if (-e $cfg_file) {
    print "Reading config from $cfg_file\n";
  }
  else {
    $log->warn("Configuration file '$cfg_file' does not exist and is ignored");
  }
}
365
366
# Parameters shared by all usage errors (exit with status 1)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined, and gzip has no effect if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);

# A job count of -1 requests five jobs per CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
}
387
Akron821db3d2017-04-06 21:19:31 +0200388
# Start serial processing: every input is converted by its own
# 'archive' run of this script, one after the other.
# The script exits with status 1 if at least one run failed.
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output flags together with their values,
  # as both are set individually for every child run
  my @child_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the next argument is its value
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Value of a removed flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @child_argv, $arg;
    };
  };
  @keep_argv = @child_argv;

  # Number of child runs that did not finish successfully
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - the list form of system bypasses the shell
    if (system(@archive_cmd) != 0) {

      # $? is -1 if the child could not be started, otherwise the lower
      # bits hold the signal and the upper byte holds the exit status
      my $reason = $? == -1 ? "could not be started: $!"
        : ($? & 127)        ? 'died with signal ' . ($? & 127)
        :                     'exited with status ' . ($? >> 8);

      $log->error("Serial processing of $_ $reason");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
443
# Lookup table of all foundries and layers to skip (lower case)
my %skip = map { lc($_) => 1 } @skip;

# All supported annotation layers as [Foundry, Layer] pairs.
# The order of the list is significant and must be kept.
my @layers;

# Base layers are only added if no other base is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all base information taken from the structure layer
my @dereko_attr = ();
foreach my $base (
  ['sentences',  $base_sentences],
  ['paragraphs', $base_paragraphs],
  ['pagebreaks', $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
}

# The optional third element names the base information to use
push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# Glemm, HNC, LWC, Malt, MDParser
push @layers, (
  ['Glemm',    'Morpho'],
  ['HNC',      'Morpho'],
  ['LWC',      'Dependency'],
  ['Malt',     'Dependency'],
  ['MDParser', 'Dependency']
);

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;

# DRuKoLa and Marmot
push @layers, (
  ['DRuKoLa', 'Morpho'],
  ['MarMoT',  'Morpho']
);
528
Akron4fa37c32017-01-20 14:43:10 +0100529
# Check filters:
# If everything is skipped ('#all'), only the explicitly requested
# annotations (given as 'Foundry#Layer') are used. Otherwise all known
# layers are used, except those whose foundry or foundry#layer is skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      my $foundry = lc($_->[0]);
      !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])})
    } @layers);
548
# Get tokenization basis in the form 'Foundry#Layer'.
# The declaration is kept separate from the conditional assignment,
# because the behaviour of 'my ... if ...' is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension - the layer is undefined if no '#' was given
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200554
# TODO: This should not be initialized for batch
# Shared cache backed by the cache file
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object with all conversion settings
my @batch_param = (
  cache           => $cache,
  meta_type       => $meta,
  overwrite       => $overwrite,
  foundry         => $token_base_foundry,
  layer           => $token_base_layer,
  gzip            => $gzip,
  log             => $log,
  koral           => $koral,
  primary         => $primary,
  pretty          => $pretty,
  anno            => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
577
# Get file name based on path information.
#
# Takes the path of a text and returns a file name: temporary
# directory fragments and the base path of the first input are removed,
# the remaining slashes are turned into dashes, and the result is cut
# down to its last three dash-separated parts where possible.
sub get_file_name ($) {
  my $file = shift;

  # The base is the first input; for a directory everything after the
  # last slash is dropped
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The path is quoted, so regex metacharacters in
  # it (like '.', '+' or parentheses) are matched literally. As before,
  # the match is not anchored to the beginning of the string.
  $file =~ s/\/?\Q$base\E//;

  # Turn the path into a dash-separated name
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Keep the last three parts only
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
594
Akron63f20d42017-04-10 23:40:29 +0200595
# Turn a file path or glob pattern into a name usable as a directory name.
# Path separators and brackets become dashes, wildcards are dropped,
# and a trailing '.zip' is removed.
sub get_file_name_from_glob ($) {
  my $name = shift;

  for ($name) {
    tr{\\/}{-};     # Transform paths
    tr/*?//d;       # Remove arbitrary fills
    tr/{}[]/-/;     # Remove class and multiple brackets
    tr/-//s;        # Remove sequences of binding characters
    s/^-//;         # Clean beginning
    s/-$//;         # Clean end
    s/\.zip$//;     # Remove file extension
  }

  return $name;
};
607
608
# Convert sigle to path construct, i.e. 'Corpus_Doc.Text' to 'Corpus/Doc/Text'
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}

# With a command, the output has to be an existing directory,
# unless the result is written to a tar file
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
}
618
Akron63f20d42017-04-10 23:40:29 +0200619
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given) and expand wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
}
639
640
# Start the timers used by stop_time() at compile time
BEGIN {
  $main::TIME      = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
}

# Log the time taken since the last stop and since the start of the script
sub stop_time {
  my $now     = Benchmark->new;
  my $lap     = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
  $main::LAST_STOP = $now;
}

# Process a single file (no command given)
if (!$cmd) {
  my $input = $input[0];

  # Create and parse new document - the path gets a trailing slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
}
672
Nils Diewald59094f22014-11-05 18:20:50 +0000673
Akrone10ad322016-02-27 10:54:26 +0100674# Extract XML files
Akron81500102017-04-07 20:45:44 +0200675if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100676
Akrond5643ad2017-07-04 20:27:13 +0200677 # Output is required
678 pod2usage(%ERROR_HASH) unless $output;
679
Akron7d4cdd82016-08-17 21:39:45 +0200680 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200681 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100682
Akron7d4cdd82016-08-17 21:39:45 +0200683 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100684 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200685 $log->error("Unzip is not installed or incompatible.");
686 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100687 };
688
Akronb0c88db2016-06-29 16:33:18 +0200689 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200690 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200691
Akron651cb8d2016-08-16 21:44:49 +0200692 my $prefix = 1;
693
Akron03b24db2016-08-16 20:54:32 +0200694 # No sigles given
695 unless (@sigle) {
696
697 # Get files
698 foreach ($archive->list_texts) {
699
700 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200701 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200702
703 # TODO: Make this OS independent
704 push @sigle, join '/', $corpus, $doc, $text;
705 };
Akron20807582016-10-26 17:11:34 +0200706 }
707
708 # Check sigle for doc sigles
709 else {
710 my @new_sigle;
711
712 my $prefix_check = 0;
713
714 # Iterate over all sigle
715 foreach (@sigle) {
716
717 # Sigle is a doc sigle
718 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200719
Akron60a8caa2017-02-17 21:51:27 +0100720 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200721 # Check if a prefix is needed
722 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100723
724 if ($prefix = $archive->check_prefix) {
725 print " with prefix ...";
726 };
Akron20807582016-10-26 17:11:34 +0200727 $prefix_check = 1;
728 };
729
Akron60a8caa2017-02-17 21:51:27 +0100730 print "\n";
731
Akron20807582016-10-26 17:11:34 +0200732 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200733 my $path = ($prefix ? './' : '') . $_;
734
735 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200736 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200737 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200738 ) ? '' : 'not '
739 );
740 print "extracted.\n";
741 }
Akron60a8caa2017-02-17 21:51:27 +0100742
743 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200744 else {
745 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100746
747 unless ($prefix_check) {
748
749 if ($prefix = $archive->check_prefix) {
750 print " with prefix ...";
751 };
752 $prefix_check = 1;
753 };
Akron20807582016-10-26 17:11:34 +0200754 };
755 };
756 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200757 };
758
Akrone10ad322016-02-27 10:54:26 +0100759 # Iterate over all given sigles and extract
760 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100761
Akron2812ba22016-10-28 21:55:59 +0200762 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200763
Akron03b24db2016-08-16 20:54:32 +0200764 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200765 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100766
Akron20807582016-10-26 17:11:34 +0200767 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200768 ($prefix ? './' : '') . $_, $output
769 ) ? '' : 'not '
770 );
Akrone10ad322016-02-27 10:54:26 +0100771 print "extracted.\n";
772 };
Akronb0c88db2016-06-29 16:33:18 +0200773 }
Akron7d4cdd82016-08-17 21:39:45 +0200774
775 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200776 else {
777 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200778 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100779 };
780}
781
Akron81500102017-04-07 20:45:44 +0200782
Akron941c1a62016-02-23 17:41:41 +0100783# Process an archive
784elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000785
Akron81500102017-04-07 20:45:44 +0200786 my $archive_output;
787
788 # First extract, then archive
789 if (defined $extract_dir) {
790
791 # Create new archive object
792 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
793
794 # Check zip capabilities
795 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200796 $log->error("Unzip is not installed or incompatible.");
797 exit 1;
Akron81500102017-04-07 20:45:44 +0200798 };
799
800 # Add further annotation archived
801 $archive->attach($_) foreach @input[1..$#input];
802
803 # Create a temporary directory
804 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200805 $extract_dir = tempdir(CLEANUP => 0);
806 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200807 };
808
Akron63f20d42017-04-10 23:40:29 +0200809 # Add some random extra to avoid clashes with multiple archives
810 $extract_dir = catdir($extract_dir, random_string('cccccc'));
811
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200813 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200814 @input = ($extract_dir);
815 }
816 else {
817 $log->error('Unable to extract from primary archive ' . $input[0] .
818 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200819 exit 1;
Akron81500102017-04-07 20:45:44 +0200820 };
821 }
822
823 # Can't create archive object
824 else {
825 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200826 exit 1;
Akron81500102017-04-07 20:45:44 +0200827 };
828 };
829
Akrone1dbc382016-07-08 22:24:52 +0200830 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100831
Akron7d4cdd82016-08-17 21:39:45 +0200832 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100833 my $pool = Parallel::ForkManager->new($jobs);
834
Akron7d4cdd82016-08-17 21:39:45 +0200835 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100836 my $iter = 1; # Current text in process
837
Akronda3097e2017-04-23 19:53:57 +0200838 my $tar_archive;
839 my $output_dir = $output;
840 my $tar_fh;
841
842 # Initialize tar archive
843 if ($to_tar) {
844 $tar_archive = Archive::Tar::Builder->new(
845 ignore_errors => 1
846 );
847
848 # Set output name
849 my $tar_file = $output;
850 unless ($tar_file =~ /\.tar$/) {
851 $tar_file .= '.tar';
852 };
853
854 # Initiate the tar file
855 print "Writing to file $tar_file\n";
856 $tar_fh = IO::File->new($tar_file, 'w');
857 $tar_fh->binmode(1);
858
859 # Set handle
860 $tar_archive->set_handle($tar_fh);
861
862 # Output to temporary directory
863 $output_dir = File::Temp->newdir;
864 };
865
Akron941c1a62016-02-23 17:41:41 +0100866 # Report on fork message
867 $pool->run_on_finish (
868 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200869 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100870 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200871
Akron08385f62016-03-22 20:37:04 +0100872 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200873 ($iter++) . "/$count]" .
874 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200875 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200876
877 if (!$code && $to_tar && $data->[2]) {
878 my $filename = $data->[2];
879
880 # Lock filehandle
881 if (flock($tar_fh, LOCK_EX)) {
882
Akron9a062ce2017-07-04 19:12:05 +0200883 my $clean_file = fileparse($filename);
884
Akronda3097e2017-04-23 19:53:57 +0200885 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200886 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200887 unlink $filename;
888
889 # Unlock filehandle
890 flock($tar_fh, LOCK_UN);
891 }
892 else {
893 $log->warn("Unable to add $filename to archive");
894 };
895 };
896
Akron4c0cf312016-10-15 16:42:09 +0200897 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100898 }
899 );
900
901 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200902 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100903 print "Reading data ...\n";
904
Akron7d4cdd82016-08-17 21:39:45 +0200905 # unless (Cache::FastMmap->new(
906 # share_file => $cache_file,
907 # cache_size => $cache_size,
908 # init_file => $cache_init
909 # )) {
910 # print "Unable to intialize cache '$cache_file'\n\n";
911 # exit(1);
912 # };
Akron11c80302016-03-18 19:44:43 +0100913
Akron486f9ab2017-04-22 23:25:19 +0200914
Akron941c1a62016-02-23 17:41:41 +0100915 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100916 if (-d $input[0]) {
917 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100918 my @dirs;
919 my $dir;
920
Akron7d4cdd82016-08-17 21:39:45 +0200921 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100922 while (1) {
923 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200924 push @dirs, $dir;
925 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100926 };
927 last unless $it->next;
928 };
929
930 print "Start processing ...\n";
931 $t = Benchmark->new;
932 $count = scalar @dirs;
933
934 DIRECTORY_LOOP:
935 for (my $i = 0; $i < $count; $i++) {
936
Akrone1dbc382016-07-08 22:24:52 +0200937 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200938 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200939 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200940 );
Akron941c1a62016-02-23 17:41:41 +0100941
942 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200943 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200944
Akron13d56622016-10-31 14:54:49 +0100945 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200946 $pool->finish(
947 0,
Akronda3097e2017-04-23 19:53:57 +0200948 [
949 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
950 undef,
951 $filename
952 ]
Akron486f9ab2017-04-22 23:25:19 +0200953 );
Akron3ec48972016-08-17 23:24:52 +0200954 }
955 else {
Akron4c0cf312016-10-15 16:42:09 +0200956 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200957 };
Akron941c1a62016-02-23 17:41:41 +0100958 };
959 }
960
961 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200962 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200963
Akron941c1a62016-02-23 17:41:41 +0100964 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200965 $log->error("Unzip is not installed or incompatible.");
966 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100967 };
968
Akron08385f62016-03-22 20:37:04 +0100969 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200970 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100971
Akron941c1a62016-02-23 17:41:41 +0100972 print "Start processing ...\n";
973 $t = Benchmark->new;
974 my @dirs = $archive->list_texts;
975 $count = scalar @dirs;
976
977 ARCHIVE_LOOP:
978 for (my $i = 0; $i < $count; $i++) {
979
980 # Split path information
981 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
982
Akrone1dbc382016-07-08 22:24:52 +0200983 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200984 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200985 get_file_name(
986 catfile($corpus, $doc, $text)
987 . '.json' . ($gzip ? '.gz' : '')
988 )
Akrone1dbc382016-07-08 22:24:52 +0200989 );
Akron941c1a62016-02-23 17:41:41 +0100990
991 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200992 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100993
Akron4c0cf312016-10-15 16:42:09 +0200994 # Create temporary file
995 $temp = File::Temp->newdir;
996
Akronbdf434a2016-10-24 17:42:07 +0200997 # TODO: Check if $filename exist at the beginning,
998 # because extraction can be horrible slow!
999
Akron941c1a62016-02-23 17:41:41 +01001000 # Extract from archive
Akron20807582016-10-26 17:11:34 +02001001 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +01001002
Akron7d4cdd82016-08-17 21:39:45 +02001003 # Create corpus directory
1004 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +01001005
Akron7d4cdd82016-08-17 21:39:45 +02001006 # Temporary directory
1007 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +01001008
Akron7d4cdd82016-08-17 21:39:45 +02001009 # Write file
Akron13d56622016-10-31 14:54:49 +01001010 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +02001011
Akron4c0cf312016-10-15 16:42:09 +02001012 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001013 $pool->finish(
1014 0,
Akronda3097e2017-04-23 19:53:57 +02001015 [
1016 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1017 $temp,
1018 $filename
1019 ]
Akron13d56622016-10-31 14:54:49 +01001020 );
1021 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001022 }
1023 else {
Akron4c0cf312016-10-15 16:42:09 +02001024 # Delete temporary file
1025 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001026 };
Akron941c1a62016-02-23 17:41:41 +01001027 }
Akron7d4cdd82016-08-17 21:39:45 +02001028
1029 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001030 else {
Akron4c0cf312016-10-15 16:42:09 +02001031 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001032 };
1033 };
1034 }
1035
1036 else {
1037 print "Input is neither a directory nor an archive.\n\n";
1038 };
1039
1040 $pool->wait_all_children;
1041
Akron11c80302016-03-18 19:44:43 +01001042 # Delete cache file
1043 unlink($cache_file) if $cache_delete;
1044
Akronda3097e2017-04-23 19:53:57 +02001045 # Close tar filehandle
1046 if ($to_tar && $tar_fh) {
1047 $tar_archive->finish;
1048 $tar_fh->close;
1049 print "Wrote to tar archive.\n";
1050 };
1051
Akron63f20d42017-04-10 23:40:29 +02001052 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001053 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001054};
Akron941c1a62016-02-23 17:41:41 +01001055
Nils Diewald2db9ad02013-10-29 19:26:43 +00001056
Akron63f20d42017-04-10 23:40:29 +02001057# Cleanup temporary extraction directory
if ($extract_dir) {
  # remove_tree returns the number of removed objects
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};


print "\n";

1065
Nils Diewald2db9ad02013-10-29 19:26:43 +00001066__END__
Akron941c1a62016-02-23 17:41:41 +01001067
1068=pod
1069
1070=encoding utf8
1071
1072=head1 NAME
1073
Akronf7ad89e2016-03-16 18:22:47 +01001074korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001075
1076
1077=head1 SYNOPSIS
1078
Akrona76d8352016-10-27 16:27:32 +02001079 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001080
Akron2fd402b2016-10-27 21:26:48 +02001081
Akron941c1a62016-02-23 17:41:41 +01001082=head1 DESCRIPTION
1083
1084L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1085compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001086The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001087
1088
1089=head1 INSTALLATION
1090
1091The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1092
Akronaf386982016-10-12 00:33:25 +02001093 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001094
Akronc13a1702016-03-15 19:33:14 +01001095In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001096be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001097Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akrona93d51b2016-10-24 20:27:48 +02001098In addition to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001099
1100=head1 ARGUMENTS
1101
Akrona76d8352016-10-27 16:27:32 +02001102 $ korapxml2krill -z --input <directory> --output <filename>
1103
1104Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001105It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001106
Akron941c1a62016-02-23 17:41:41 +01001107=over 2
1108
1109=item B<archive>
1110
Akron081639e2017-04-21 19:01:39 +02001111 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001112
Akron2fd402b2016-10-27 21:26:48 +02001113Converts an archive of KorAP-XML documents. It expects a directory
1114(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001115
1116=item B<extract>
1117
Akrona76d8352016-10-27 16:27:32 +02001118 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1119
1120Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001121
Akron63f20d42017-04-10 23:40:29 +02001122=item B<serial>
1123
1124 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1125
1126Convert archives sequentially. The inputs are not merged but treated
1127as they are (so they may be premerged or globs).
1128The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001129are created based on the archive name. In case the C<--to-tar> flag is given,
1130the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001131
1132
Akron941c1a62016-02-23 17:41:41 +01001133=back
1134
1135
1136=head1 OPTIONS
1137
1138=over 2
1139
Akrona76d8352016-10-27 16:27:32 +02001140=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001141
Akrona76d8352016-10-27 16:27:32 +02001142Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001143
Akron7606afa2016-10-25 16:23:49 +02001144Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001145document, while C<archive> expects a KorAP-XML corpus folder or a zip
1146file to batch process multiple files.
1147C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001148
Akrona76d8352016-10-27 16:27:32 +02001149C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001150that the first archive listed contains all primary data files
1151and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001152
Akron7606afa2016-10-25 16:23:49 +02001153 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001154
Akron821db3d2017-04-06 21:19:31 +02001155Input may also be defined using BSD glob wildcards.
1156
1157 -i 'file/news*.zip'
1158
1159The extended input array will be sorted in length order, so the shortest
1160path needs to contain all primary data files and all meta data files.
1161
Akron0c3e3752016-06-28 15:55:53 +02001162(The directory structure follows the base directory format,
1163that may include a C<.> root folder.
1164In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001165need to be passed with a hash sign in front of the archive's name.
1166This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001167
Akron7606afa2016-10-25 16:23:49 +02001168To support zip files, a version of C<unzip> needs to be installed that is
1169compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001170
Akron7606afa2016-10-25 16:23:49 +02001171B<The root folder switch using the hash sign is experimental and
1172may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001173
Akronf73ffb62018-06-27 12:13:59 +02001174
Akron63f20d42017-04-10 23:40:29 +02001175=item B<--input-base|-ib> <directory>
1176
1177The base directory for inputs.
1178
1179
Akron941c1a62016-02-23 17:41:41 +01001180=item B<--output|-o> <directory|file>
1181
1182Output folder for archive processing or
1183document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001184writes to C<STDOUT> by default
1185(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001186
1187=item B<--overwrite|-w>
1188
1189Overwrite files that already exist.
1190
Akronf73ffb62018-06-27 12:13:59 +02001191
Akron3741f8b2016-12-21 19:55:21 +01001192=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001193
1194Define the default tokenization by specifying
1195the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001196of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001197
Akron3741f8b2016-12-21 19:55:21 +01001198
1199=item B<--base-sentences|-bs> <foundry>#<layer>
1200
1201Define the layer for base sentences.
1202If given, this will be used instead of using C<Base#Sentences>.
1203Currently C<DeReKo#Structure> is the only additional layer supported.
1204
1205 Defaults to unset.
1206
1207
1208=item B<--base-paragraphs|-bp> <foundry>#<layer>
1209
1210Define the layer for base paragraphs.
1211If given, this will be used instead of using C<Base#Paragraphs>.
1212Currently C<DeReKo#Structure> is the only additional layer supported.
1213
1214 Defaults to unset.
1215
1216
Akron41ac10b2017-02-08 22:47:25 +01001217=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1218
1219Define the layer for base pagebreaks.
1220Currently C<DeReKo#Structure> is the only layer supported.
1221
1222 Defaults to unset.
1223
1224
Akron941c1a62016-02-23 17:41:41 +01001225=item B<--skip|-s> <foundry>[#<layer>]
1226
Akronf7ad89e2016-03-16 18:22:47 +01001227Skip specific annotations by specifying the foundry
1228(and optionally the layer with a C<#>-prefix),
1229e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001230Can be set multiple times.
1231
Akronf73ffb62018-06-27 12:13:59 +02001232
Akronc13a1702016-03-15 19:33:14 +01001233=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001234
Akronf7ad89e2016-03-16 18:22:47 +01001235Convert specific annotations by specifying the foundry
1236(and optionally the layer with a C<#>-prefix),
1237e.g. C<Mate> or C<Mate#Morpho>.
1238Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001239
Akronf73ffb62018-06-27 12:13:59 +02001240
Akron941c1a62016-02-23 17:41:41 +01001241=item B<--primary|-p>
1242
Akronc13a1702016-03-15 19:33:14 +01001243Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001244Can be flagged using C<--no-primary> as well.
1245This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001246
Akronf73ffb62018-06-27 12:13:59 +02001247
Akroned9baf02019-01-22 17:03:25 +01001248=item B<--non-word-tokens|-nwt>
1249
1250Tokenize non-word tokens like word tokens (defined as matching
1251C</[\d\w]/>). Useful to treat punctuations as tokens.
1252
1253 Defaults to unset.
1254
Akron941c1a62016-02-23 17:41:41 +01001255=item B<--jobs|-j>
1256
1257Define the number of concurrent jobs in separated forks
Akronf7ad89e2016-03-16 18:22:47 +01001258for archive processing.
Akron11c80302016-03-18 19:44:43 +01001259Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001260
1261If C<sequential-extraction> is not set to true, this will
1262also apply to extraction.
1263
Akronc11f7982017-02-21 21:20:14 +01001264Pass -1, and the value will be set automatically to 5
1265times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001266This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001267
Akronf73ffb62018-06-27 12:13:59 +02001268
Akron263274c2019-02-07 09:48:30 +01001269=item B<--koral|-k>
1270
1271Version of the output format. Supported versions are:
1272C<0> for legacy serialization, C<0.03> for serialization
1273with metadata fields as key-values on the root object,
1274C<0.4> for serialization with metadata fields as a list
1275of C<"@type":"koral:field"> objects.
1276
1277Currently defaults to C<0.03>.
1278
1279
Akron9ec88872017-04-12 16:29:06 +02001280=item B<--sequential-extraction|-se>
1281
1282Flag to extract archives sequentially, so the C<jobs> value does not apply to extraction.
1283Some systems may have problems with extracting multiple archives
1284to the same folder at the same time.
1285Can be flagged using C<--no-sequential-extraction> as well.
1286Defaults to C<false>.
1287
Akronf73ffb62018-06-27 12:13:59 +02001288
Akron35db6e32016-03-17 22:42:22 +01001289=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001290
Akron35db6e32016-03-17 22:42:22 +01001291Define the metadata parser to use. Defaults to C<I5>.
1292Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1293This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001294
Akronf73ffb62018-06-27 12:13:59 +02001295
Akron941c1a62016-02-23 17:41:41 +01001296=item B<--pretty|-y>
1297
Akronc13a1702016-03-15 19:33:14 +01001298Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001299This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001300
Akronf73ffb62018-06-27 12:13:59 +02001301
Akron941c1a62016-02-23 17:41:41 +01001302=item B<--gzip|-z>
1303
Akronf7ad89e2016-03-16 18:22:47 +01001304Compress the output.
1305Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001306
Akronf73ffb62018-06-27 12:13:59 +02001307
Akron11c80302016-03-18 19:44:43 +01001308=item B<--cache|-c>
1309
1310File to mmap a cache (using L<Cache::FastMmap>).
1311Defaults to C<korapxml2krill.cache> in the calling directory.
1312
Akronf73ffb62018-06-27 12:13:59 +02001313
Akron11c80302016-03-18 19:44:43 +01001314=item B<--cache-size|-cs>
1315
1316Size of the cache. Defaults to C<50m>.
1317
Akronf73ffb62018-06-27 12:13:59 +02001318
Akron11c80302016-03-18 19:44:43 +01001319=item B<--cache-init|-ci>
1320
1321Initialize cache file.
1322Can be flagged using C<--no-cache-init> as well.
1323Defaults to C<true>.
1324
Akronf73ffb62018-06-27 12:13:59 +02001325
Akron11c80302016-03-18 19:44:43 +01001326=item B<--cache-delete|-cd>
1327
1328Delete cache file after processing.
1329Can be flagged using C<--no-cache-delete> as well.
1330Defaults to C<true>.
1331
Akronf73ffb62018-06-27 12:13:59 +02001332
Akron636aa112017-04-07 18:48:56 +02001333=item B<--config|-cfg>
1334
1335Configure the parameters of your call in a file
1336of key-value pairs with whitespace separator
1337
1338 overwrite 1
1339 token DeReKo#Structure
1340 ...
1341
1342Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001343C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001344C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001345C<output>,
1346C<temp-extract>, C<sequential-extraction>,
1347C<base-sentences>, C<base-paragraphs>,
1348C<base-pagebreaks>,
1349C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001350(semicolon separated), C<anno> (semicolon separated).
1351
Akronf73ffb62018-06-27 12:13:59 +02001352Configuration parameters will always be overwritten by
1353passed parameters.
1354
1355
Akron81500102017-04-07 20:45:44 +02001356=item B<--temporary-extract|-te>
1357
1358Only valid for the C<archive> command.
1359
1360This will first extract all files into a
1361directory and then will archive.
1362If the directory is given as C<:temp:>,
1363a temporary directory is used.
1364This is especially useful to avoid
1365massive unzipping and potential
1366network latency.
Akron636aa112017-04-07 18:48:56 +02001367
Akronf73ffb62018-06-27 12:13:59 +02001368
Akrone10ad322016-02-27 10:54:26 +01001369=item B<--sigle|-sg>
1370
Akron20807582016-10-26 17:11:34 +02001371Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001372Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001373I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001374Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001375In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001376On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001377
Akronf73ffb62018-06-27 12:13:59 +02001378
Akron941c1a62016-02-23 17:41:41 +01001379=item B<--log|-l>
1380
1381The L<Log::Log4perl> log level, defaults to C<ERROR>.
1382
Akronf73ffb62018-06-27 12:13:59 +02001383
Akron941c1a62016-02-23 17:41:41 +01001384=item B<--help|-h>
1385
1386Print this document.
1387
Akronf73ffb62018-06-27 12:13:59 +02001388
Akron941c1a62016-02-23 17:41:41 +01001389=item B<--version|-v>
1390
1391Print version information.
1392
1393=back
1394
Akronf73ffb62018-06-27 12:13:59 +02001395
Akronc13a1702016-03-15 19:33:14 +01001396=head1 ANNOTATION SUPPORT
1397
1398L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1399developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1400The base foundry with paragraphs, sentences, and the text element are mandatory for
1401L<Krill|https://github.com/KorAP/Krill>.
1402
Akron821db3d2017-04-06 21:19:31 +02001403 Base
1404 #Paragraphs
1405 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001406
Akron821db3d2017-04-06 21:19:31 +02001407 Connexor
1408 #Morpho
1409 #Phrase
1410 #Sentences
1411 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001412
Akron821db3d2017-04-06 21:19:31 +02001413 CoreNLP
1414 #Constituency
1415 #Morpho
1416 #NamedEntities
1417 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001418
Akronce125b62017-06-19 11:54:36 +02001419 CMC
1420 #Morpho
1421
Akron821db3d2017-04-06 21:19:31 +02001422 DeReKo
1423 #Structure
Akronc13a1702016-03-15 19:33:14 +01001424
Akron821db3d2017-04-06 21:19:31 +02001425 DRuKoLa
1426 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001427
Akron821db3d2017-04-06 21:19:31 +02001428 Glemm
1429 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001430
Akronea1aed52018-07-19 14:43:34 +02001431 HNC
1432 #Morpho
1433
Akron4c679192018-01-16 17:41:49 +01001434 LWC
1435 #Dependency
1436
Akron821db3d2017-04-06 21:19:31 +02001437 Malt
1438 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001439
Akron821db3d2017-04-06 21:19:31 +02001440 MarMoT
1441 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001442
Akron821db3d2017-04-06 21:19:31 +02001443 Mate
1444 #Dependency
1445 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001446
Akron821db3d2017-04-06 21:19:31 +02001447 MDParser
1448 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001449
Akron821db3d2017-04-06 21:19:31 +02001450 OpenNLP
1451 #Morpho
1452 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001453
Akron821db3d2017-04-06 21:19:31 +02001454 Sgbr
1455 #Lemma
1456 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001457
Akron821db3d2017-04-06 21:19:31 +02001458 TreeTagger
1459 #Morpho
1460 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001461
Akron821db3d2017-04-06 21:19:31 +02001462 XIP
1463 #Constituency
1464 #Morpho
1465 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001466
Akronc13a1702016-03-15 19:33:14 +01001467
1468More importers are in preparation.
1469New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1470See the built-in annotation importers as examples.
1471
Akronf73ffb62018-06-27 12:13:59 +02001472
Akron941c1a62016-02-23 17:41:41 +01001473=head1 AVAILABILITY
1474
1475 https://github.com/KorAP/KorAP-XML-Krill
1476
1477
1478=head1 COPYRIGHT AND LICENSE
1479
Akroned9baf02019-01-22 17:03:25 +01001480Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001481
Akron941c1a62016-02-23 17:41:41 +01001482Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001483
Akrona76d8352016-10-27 16:27:32 +02001484Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001485
1486L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1487Corpus Analysis Platform at the
1488L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1489member of the
1490L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1491
1492This program is free software published under the
1493L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1494
1495=cut