blob: 353028804368a176f7bf7bd4f6c6d6439509e9b8 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron941c1a62016-02-23 17:41:41 +0100141# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100142
# Date of the most recent entry in the CHANGES list above
our $LAST_CHANGE = '2019/02/13';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Default Koral version, used unless --koral or the configuration overrides it
our $KORAL_VERSION = 0.03;

# Banner passed as -msg to pod2usage for help, version and usage errors
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
149
# Forward declarations of the prototyped helpers defined further below
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a leading argument that does not start with a dash
# is taken off the argument list and treated as the sub command
our @ARGV;
my $cmd = ($ARGV[0] && index($ARGV[0], '-') != 0) ? shift(@ARGV) : undef;

# Remember the remaining arguments before option parsing
my @keep_argv = @ARGV;

# Targets for the list options
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100164
# Targets for all scalar command line options
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Print the usage sections together with the version banner
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose => 99,
    -msg => $VERSION_MSG,
    -output => '-'
  );
};

# Print the version banner
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg => $VERSION_MSG,
    -output => '-'
  )
};

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);
210
Akron63f20d42017-04-10 23:40:29 +0200211
# Load from configuration.
# Values given on the command line always win: a configuration value
# is only taken over when the corresponding option is still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Configuration key => scalar option it fills
  my @scalar_options = (
    ['overwrite'             => \$overwrite],
    ['gzip'                  => \$gzip],
    ['jobs'                  => \$jobs],
    ['koral'                 => \$koral],
    ['input-base'            => \$input_base],
    ['temporary-extract'     => \$extract_dir],
    ['token'                 => \$token_base],
    ['non-word-tokens'       => \$non_word_tokens],
    ['cache'                 => \$cache_file],
    ['cache-size'            => \$cache_size],
    ['cache-delete'          => \$cache_delete],
    ['cache-init'            => \$cache_init],
    ['sequential-extraction' => \$sequential_extraction],
    ['meta'                  => \$meta],
    ['output'                => \$output],
    ['base-sentences'        => \$base_sentences],
    ['base-paragraphs'       => \$base_paragraphs],
    ['base-pagebreaks'       => \$base_pagebreaks],
    ['to-tar'                => \$to_tar],
    ['log'                   => \$log_level]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;
    if (!defined($$target) && defined $config{$key}) {
      $$target = $config{$key};
    };
  };

  # Configuration key => list option it fills.
  # List values are written as a single, semicolon separated string.
  my @list_options = (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;
    if (!scalar(@$target) && defined $config{$key}) {
      @$target = split /\s*;\s*/, $config{$key};
    };
  };
};
333
Akron63f20d42017-04-10 23:40:29 +0200334
# Fall back to defaults for every option that is neither
# set on the command line nor in the configuration
my @defaults = (
  [\$token_base            => 'OpenNLP#tokens'],
  [\$cache_file            => 'korapxml2krill.cache'],
  [\$cache_size            => '50m'],
  [\$jobs                  => 0],
  [\$koral                 => $KORAL_VERSION],
  [\$cache_delete          => 1],
  [\$cache_init            => 1],
  [\$sequential_extraction => 0],
  [\$log_level             => 'ERROR'],
  [\$base_sentences        => ''],
  [\$base_paragraphs       => ''],
  [\$base_pagebreaks       => ''],
  [\$non_word_tokens       => 0]
);
${ $_->[0] } //= $_->[1] foreach @defaults;

# Lower-case the base annotation names
foreach my $base ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $base = lc $base;
};
Akron3741f8b2016-12-21 19:55:21 +0100353
Akron63f20d42017-04-10 23:40:29 +0200354
# Initialize log4perl object: everything at or above the
# requested level is written colored to STDERR
my %log_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');

if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};

# Parameters for pod2usage in case of a usage error
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -output => '-',
  -exit => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};

# A job count of -1 requests five jobs per CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
389
Akron821db3d2017-04-06 21:19:31 +0200390
# Start serial processing:
# every input is converted by a separate run of this script
# using the 'archive' command, one after the other
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output parameters (flag and following value)
  # from the kept arguments, as they are set per archive run below
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Number of archive runs that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $archive_input, '-o', $new_out
    );
    print "Start serial processing of $archive_input to $new_out\n";

    # Start archiving - report failing runs instead of ignoring them,
    # but continue with the remaining inputs
    if (system(@archive_cmd) != 0) {
      $failed++;
      $log->error(
        "Archiving of $archive_input failed (" .
          ($? == -1 ? "unable to execute: $!" : 'exit status ' . ($? >> 8)) .
          ')'
      );
    };
  };

  # Signal failure, if at least one archive run failed
  exit($failed ? 1 : 0);
};
445
# Lookup table of the (lower-cased) skip parameters
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotation layers as [Foundry, Layer] pairs
my @layers;

# The base layers are only added, if no other
# base for sentences or paragraphs is requested
push(@layers, ['Base', 'Sentences']) unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

push(
  @layers,

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo - collect everything that is based on the DeReKo structure
my @dereko_attr = ();
foreach my $base (
  ['sentences'  => $base_sentences],
  ['paragraphs' => $base_paragraphs],
  ['pagebreaks' => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

push(
  @layers,
  ($dereko_attr[0]
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure'])
);

push(
  @layers,

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
530
Akron4fa37c32017-01-20 14:43:10 +0100531
# Check filters:
# If everything is skipped ('#all'), only the explicitly requested
# annotations (given as Foundry#Layer) are used. Otherwise all known
# layers are used, except those whose Foundry or Foundry#Layer is skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      !($skip{lc($_->[0])} || $skip{lc($_->[0]) . '#' . lc($_->[1])})
    } @layers);
550
# Get tokenization basis (given as Foundry#Layer).
# The declaration is kept separate from the conditional assignment,
# as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200556
# TODO: This should not be initialized for batch
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object with the cache,
# the tokenization basis and all conversion settings
my @batch_param = (
  cache => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  gzip => $gzip,
  log => $log,
  koral => $koral,
  primary => $primary,
  pretty => $pretty,
  anno => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
579
# Get file name based on path information.
#
# Takes the path of a text and returns a flat name for it:
# A leading temporary directory fragment ('/tmp/<name>') and the first
# occurrence of the base path are removed, all slashes are turned into
# dashes and leading dashes are dropped. If at least four dash separated
# parts remain, the first part is dropped as well.
#
# The base path is the first input. If the first input is a directory,
# its last path segment is removed (unless the path ends with a slash).
sub get_file_name ($) {
  my $file = shift;

  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The base is a file system path and no pattern,
  # so all regular expression meta characters need to be quoted.
  # As before, the match is not anchored to the beginning of the string.
  $file =~ s/\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
596
Akron63f20d42017-04-10 23:40:29 +0200597
# Create a flat name based on an input path or glob pattern.
#
# Path separators and brackets are turned into dashes, wildcards
# are removed and a trailing '.zip' extension is dropped, so the
# result can be used as the name of an output directory.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;       # Transform paths
  $glob =~ s/[\*\?]//g;        # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;        # Remove sequences of binding characters
  $glob =~ s/^-//;             # Clean beginning
  $glob =~ s/-$//;             # Clean end
  $glob =~ s/\.zip$//;         # Remove file extension

  # Removing the extension may uncover a binding character
  # (e.g. 'data/*.zip' results in 'data-'), so clean the end again
  $glob =~ s/-$//;
  return $glob;
};
609
610
# Convert sigle to path construct:
# the first underscore and the first dot following it
# are replaced by slashes, surrounding whitespace is removed
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# With a command given, a defined output has to be
# an existing directory, unless a tar file is written
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
620
Akron63f20d42017-04-10 23:40:29 +0200621
# Glob and prefix files
if (@input) {

  # Prefix all inputs with the input root (if given)
  # and expand the wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n";
};
641
642
# Remember the start time at compile time
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};

# Log the time passed since the last stop (or the start)
# together with the overall time and start a new stop interval
sub stop_time {
  my $now = Benchmark->new;
  my $interval = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));
  $log->info(
    'The code took: ' . $interval . ' (overall: ' . $overall . ')'
  );
  $main::LAST_STOP = $now;
};

# Process a single file (no command given)
unless ($cmd) {

  # The first input is the directory of the document -
  # make sure it ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Create, parse and convert the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
674
Nils Diewald59094f22014-11-05 18:20:50 +0000675
Akrone10ad322016-02-27 10:54:26 +0100676# Extract XML files
Akron81500102017-04-07 20:45:44 +0200677if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100678
Akrond5643ad2017-07-04 20:27:13 +0200679 # Output is required
680 pod2usage(%ERROR_HASH) unless $output;
681
Akron7d4cdd82016-08-17 21:39:45 +0200682 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200683 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100684
Akron7d4cdd82016-08-17 21:39:45 +0200685 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100686 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200687 $log->error("Unzip is not installed or incompatible.");
688 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100689 };
690
Akronb0c88db2016-06-29 16:33:18 +0200691 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200692 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200693
Akron651cb8d2016-08-16 21:44:49 +0200694 my $prefix = 1;
695
Akron03b24db2016-08-16 20:54:32 +0200696 # No sigles given
697 unless (@sigle) {
698
699 # Get files
700 foreach ($archive->list_texts) {
701
702 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200703 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200704
705 # TODO: Make this OS independent
706 push @sigle, join '/', $corpus, $doc, $text;
707 };
Akron20807582016-10-26 17:11:34 +0200708 }
709
710 # Check sigle for doc sigles
711 else {
712 my @new_sigle;
713
714 my $prefix_check = 0;
715
716 # Iterate over all sigle
717 foreach (@sigle) {
718
719 # Sigle is a doc sigle
720 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200721
Akron60a8caa2017-02-17 21:51:27 +0100722 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200723 # Check if a prefix is needed
724 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100725
726 if ($prefix = $archive->check_prefix) {
727 print " with prefix ...";
728 };
Akron20807582016-10-26 17:11:34 +0200729 $prefix_check = 1;
730 };
731
Akron60a8caa2017-02-17 21:51:27 +0100732 print "\n";
733
Akron20807582016-10-26 17:11:34 +0200734 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200735 my $path = ($prefix ? './' : '') . $_;
736
737 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200738 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200739 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200740 ) ? '' : 'not '
741 );
742 print "extracted.\n";
743 }
Akron60a8caa2017-02-17 21:51:27 +0100744
745 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200746 else {
747 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100748
749 unless ($prefix_check) {
750
751 if ($prefix = $archive->check_prefix) {
752 print " with prefix ...";
753 };
754 $prefix_check = 1;
755 };
Akron20807582016-10-26 17:11:34 +0200756 };
757 };
758 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200759 };
760
Akrone10ad322016-02-27 10:54:26 +0100761 # Iterate over all given sigles and extract
762 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100763
Akron2812ba22016-10-28 21:55:59 +0200764 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200765
Akron03b24db2016-08-16 20:54:32 +0200766 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200767 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100768
Akron20807582016-10-26 17:11:34 +0200769 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200770 ($prefix ? './' : '') . $_, $output
771 ) ? '' : 'not '
772 );
Akrone10ad322016-02-27 10:54:26 +0100773 print "extracted.\n";
774 };
Akronb0c88db2016-06-29 16:33:18 +0200775 }
Akron7d4cdd82016-08-17 21:39:45 +0200776
777 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200778 else {
779 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200780 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100781 };
782}
783
Akron81500102017-04-07 20:45:44 +0200784
Akron941c1a62016-02-23 17:41:41 +0100785# Process an archive
786elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000787
Akron81500102017-04-07 20:45:44 +0200788 my $archive_output;
789
790 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100791 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200792
793 # Create new archive object
794 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
795
796 # Check zip capabilities
797 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200798 $log->error("Unzip is not installed or incompatible.");
799 exit 1;
Akron81500102017-04-07 20:45:44 +0200800 };
801
802 # Add further annotation archived
803 $archive->attach($_) foreach @input[1..$#input];
804
805 # Create a temporary directory
806 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200807 $extract_dir = tempdir(CLEANUP => 0);
808 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200809 };
810
Akron63f20d42017-04-10 23:40:29 +0200811 # Add some random extra to avoid clashes with multiple archives
812 $extract_dir = catdir($extract_dir, random_string('cccccc'));
813
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200815 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200816 @input = ($extract_dir);
817 }
818 else {
819 $log->error('Unable to extract from primary archive ' . $input[0] .
820 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200821 exit 1;
Akron81500102017-04-07 20:45:44 +0200822 };
823 }
824
825 # Can't create archive object
826 else {
827 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200828 exit 1;
Akron81500102017-04-07 20:45:44 +0200829 };
830 };
831
Akrone1dbc382016-07-08 22:24:52 +0200832 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100833
Akron7d4cdd82016-08-17 21:39:45 +0200834 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100835 my $pool = Parallel::ForkManager->new($jobs);
836
Akron7d4cdd82016-08-17 21:39:45 +0200837 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100838 my $iter = 1; # Current text in process
839
Akronda3097e2017-04-23 19:53:57 +0200840 my $tar_archive;
841 my $output_dir = $output;
842 my $tar_fh;
843
844 # Initialize tar archive
845 if ($to_tar) {
846 $tar_archive = Archive::Tar::Builder->new(
847 ignore_errors => 1
848 );
849
850 # Set output name
851 my $tar_file = $output;
852 unless ($tar_file =~ /\.tar$/) {
853 $tar_file .= '.tar';
854 };
855
856 # Initiate the tar file
857 print "Writing to file $tar_file\n";
858 $tar_fh = IO::File->new($tar_file, 'w');
859 $tar_fh->binmode(1);
860
861 # Set handle
862 $tar_archive->set_handle($tar_fh);
863
864 # Output to temporary directory
865 $output_dir = File::Temp->newdir;
866 };
867
Akron941c1a62016-02-23 17:41:41 +0100868 # Report on fork message
869 $pool->run_on_finish (
870 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200871 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100872 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200873
Akron08385f62016-03-22 20:37:04 +0100874 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200875 ($iter++) . "/$count]" .
876 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200877 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200878
879 if (!$code && $to_tar && $data->[2]) {
880 my $filename = $data->[2];
881
882 # Lock filehandle
883 if (flock($tar_fh, LOCK_EX)) {
884
Akron9a062ce2017-07-04 19:12:05 +0200885 my $clean_file = fileparse($filename);
886
Akronda3097e2017-04-23 19:53:57 +0200887 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200888 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200889 unlink $filename;
890
891 # Unlock filehandle
892 flock($tar_fh, LOCK_UN);
893 }
894 else {
895 $log->warn("Unable to add $filename to archive");
896 };
897 };
898
Akron4c0cf312016-10-15 16:42:09 +0200899 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100900 }
901 );
902
903 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200904 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100905 print "Reading data ...\n";
906
Akron7d4cdd82016-08-17 21:39:45 +0200907 # unless (Cache::FastMmap->new(
908 # share_file => $cache_file,
909 # cache_size => $cache_size,
910 # init_file => $cache_init
911 # )) {
912 # print "Unable to intialize cache '$cache_file'\n\n";
913 # exit(1);
914 # };
Akron11c80302016-03-18 19:44:43 +0100915
Akron486f9ab2017-04-22 23:25:19 +0200916
Akron941c1a62016-02-23 17:41:41 +0100917 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100918 if (-d $input[0]) {
919 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100920 my @dirs;
921 my $dir;
922
Akron7d4cdd82016-08-17 21:39:45 +0200923 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100924 while (1) {
925 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200926 push @dirs, $dir;
927 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100928 };
929 last unless $it->next;
930 };
931
932 print "Start processing ...\n";
933 $t = Benchmark->new;
934 $count = scalar @dirs;
935
936 DIRECTORY_LOOP:
937 for (my $i = 0; $i < $count; $i++) {
938
Akrone1dbc382016-07-08 22:24:52 +0200939 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200940 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200941 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200942 );
Akron941c1a62016-02-23 17:41:41 +0100943
944 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200945 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200946
Akron13d56622016-10-31 14:54:49 +0100947 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200948 $pool->finish(
949 0,
Akronda3097e2017-04-23 19:53:57 +0200950 [
951 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
952 undef,
953 $filename
954 ]
Akron486f9ab2017-04-22 23:25:19 +0200955 );
Akron3ec48972016-08-17 23:24:52 +0200956 }
957 else {
Akron4c0cf312016-10-15 16:42:09 +0200958 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200959 };
Akron941c1a62016-02-23 17:41:41 +0100960 };
961 }
962
963 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200964 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200965
Akron941c1a62016-02-23 17:41:41 +0100966 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200967 $log->error("Unzip is not installed or incompatible.");
968 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100969 };
970
Akron08385f62016-03-22 20:37:04 +0100971 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200972 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100973
Akron941c1a62016-02-23 17:41:41 +0100974 print "Start processing ...\n";
975 $t = Benchmark->new;
976 my @dirs = $archive->list_texts;
977 $count = scalar @dirs;
978
979 ARCHIVE_LOOP:
980 for (my $i = 0; $i < $count; $i++) {
981
982 # Split path information
983 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
984
Akrone1dbc382016-07-08 22:24:52 +0200985 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200986 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200987 get_file_name(
988 catfile($corpus, $doc, $text)
989 . '.json' . ($gzip ? '.gz' : '')
990 )
Akrone1dbc382016-07-08 22:24:52 +0200991 );
Akron941c1a62016-02-23 17:41:41 +0100992
993 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200994 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100995
Akron4c0cf312016-10-15 16:42:09 +0200996 # Create temporary file
997 $temp = File::Temp->newdir;
998
Akronbdf434a2016-10-24 17:42:07 +0200999 # TODO: Check if $filename exist at the beginning,
1000 # because extraction can be horrible slow!
1001
Akron941c1a62016-02-23 17:41:41 +01001002 # Extract from archive
Akron20807582016-10-26 17:11:34 +02001003 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +01001004
Akron7d4cdd82016-08-17 21:39:45 +02001005 # Create corpus directory
1006 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +01001007
Akron7d4cdd82016-08-17 21:39:45 +02001008 # Temporary directory
1009 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +01001010
Akron7d4cdd82016-08-17 21:39:45 +02001011 # Write file
Akron13d56622016-10-31 14:54:49 +01001012 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +02001013
Akron4c0cf312016-10-15 16:42:09 +02001014 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001015 $pool->finish(
1016 0,
Akronda3097e2017-04-23 19:53:57 +02001017 [
1018 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1019 $temp,
1020 $filename
1021 ]
Akron13d56622016-10-31 14:54:49 +01001022 );
1023 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001024 }
1025 else {
Akron4c0cf312016-10-15 16:42:09 +02001026 # Delete temporary file
1027 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001028 };
Akron941c1a62016-02-23 17:41:41 +01001029 }
Akron7d4cdd82016-08-17 21:39:45 +02001030
1031 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001032 else {
Akron4c0cf312016-10-15 16:42:09 +02001033 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001034 };
1035 };
1036 }
1037
1038 else {
1039 print "Input is neither a directory nor an archive.\n\n";
1040 };
1041
1042 $pool->wait_all_children;
1043
Akron11c80302016-03-18 19:44:43 +01001044 # Delete cache file
1045 unlink($cache_file) if $cache_delete;
1046
Akronda3097e2017-04-23 19:53:57 +02001047 # Close tar filehandle
1048 if ($to_tar && $tar_fh) {
1049 $tar_archive->finish;
1050 $tar_fh->close;
1051 print "Wrote to tar archive.\n";
1052 };
1053
Akron63f20d42017-04-10 23:40:29 +02001054 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001055 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001056};
Akron941c1a62016-02-23 17:41:41 +01001057
Nils Diewald2db9ad02013-10-29 19:26:43 +00001058
# Remove the extraction directory (if one was set)
# once all processing is finished and report the number
# of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir .
    ' with ' . $removed . " objects.\n";
};


print "\n";
1067
Nils Diewald2db9ad02013-10-29 19:26:43 +00001068__END__
Akron941c1a62016-02-23 17:41:41 +01001069
1070=pod
1071
1072=encoding utf8
1073
1074=head1 NAME
1075
Akronf7ad89e2016-03-16 18:22:47 +01001076korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001077
1078
1079=head1 SYNOPSIS
1080
Akrona76d8352016-10-27 16:27:32 +02001081 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001082
Akron2fd402b2016-10-27 21:26:48 +02001083
Akron941c1a62016-02-23 17:41:41 +01001084=head1 DESCRIPTION
1085
1086L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1087compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001088The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001089
1090
1091=head1 INSTALLATION
1092
1093The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1094
Akronaf386982016-10-12 00:33:25 +02001095 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001096
Akronc13a1702016-03-15 19:33:14 +01001097In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001098be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001099Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001101
1102=head1 ARGUMENTS
1103
Akrona76d8352016-10-27 16:27:32 +02001104 $ korapxml2krill -z --input <directory> --output <filename>
1105
1106Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001107It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001108
Akron941c1a62016-02-23 17:41:41 +01001109=over 2
1110
1111=item B<archive>
1112
Akron081639e2017-04-21 19:01:39 +02001113 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001114
Akron2fd402b2016-10-27 21:26:48 +02001115Converts an archive of KorAP-XML documents. It expects a directory
1116(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001117
1118=item B<extract>
1119
Akrona76d8352016-10-27 16:27:32 +02001120 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1121
1122Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001123
Akron63f20d42017-04-10 23:40:29 +02001124=item B<serial>
1125
1126 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1127
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001133
1134
Akron941c1a62016-02-23 17:41:41 +01001135=back
1136
1137
1138=head1 OPTIONS
1139
1140=over 2
1141
Akrona76d8352016-10-27 16:27:32 +02001142=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001143
Akrona76d8352016-10-27 16:27:32 +02001144Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001145
Akron7606afa2016-10-25 16:23:49 +02001146Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001147document, while C<archive> expects a KorAP-XML corpus folder or a zip
1148file to batch process multiple files.
1149C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001150
Akrona76d8352016-10-27 16:27:32 +02001151C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001152that the first archive listed contains all primary data files
1153and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001154
Akron7606afa2016-10-25 16:23:49 +02001155 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001156
Akron821db3d2017-04-06 21:19:31 +02001157Input may also be defined using BSD glob wildcards.
1158
1159 -i 'file/news*.zip'
1160
1161The extended input array will be sorted in length order, so the shortest
1162path needs to contain all primary data files and all meta data files.
1163
Akron0c3e3752016-06-28 15:55:53 +02001164(The directory structure follows the base directory format,
1165that may include a C<.> root folder.
1166In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001167need to be passed with a hash sign in front of the archive's name.
1168This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001169
Akron7606afa2016-10-25 16:23:49 +02001170To support zip files, a version of C<unzip> needs to be installed that is
1171compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001172
Akron7606afa2016-10-25 16:23:49 +02001173B<The root folder switch using the hash sign is experimental and
1174may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001175
Akronf73ffb62018-06-27 12:13:59 +02001176
Akron63f20d42017-04-10 23:40:29 +02001177=item B<--input-base|-ib> <directory>
1178
1179The base directory for inputs.
1180
1181
Akron941c1a62016-02-23 17:41:41 +01001182=item B<--output|-o> <directory|file>
1183
1184Output folder for archive processing or
1185document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001186writes to C<STDOUT> by default
1187(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001188
1189=item B<--overwrite|-w>
1190
1191Overwrite files that already exist.
1192
Akronf73ffb62018-06-27 12:13:59 +02001193
Akron3741f8b2016-12-21 19:55:21 +01001194=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001195
1196Define the default tokenization by specifying
1197the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001198of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001199
Akron3741f8b2016-12-21 19:55:21 +01001200
1201=item B<--base-sentences|-bs> <foundry>#<layer>
1202
1203Define the layer for base sentences.
1204If given, this will be used instead of using C<Base#Sentences>.
1205Currently C<DeReKo#Structure> is the only additional layer supported.
1206
1207 Defaults to unset.
1208
1209
1210=item B<--base-paragraphs|-bp> <foundry>#<layer>
1211
1212Define the layer for base paragraphs.
1213If given, this will be used instead of using C<Base#Paragraphs>.
1214Currently C<DeReKo#Structure> is the only additional layer supported.
1215
1216 Defaults to unset.
1217
1218
Akron41ac10b2017-02-08 22:47:25 +01001219=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1220
1221Define the layer for base pagebreaks.
1222Currently C<DeReKo#Structure> is the only layer supported.
1223
1224 Defaults to unset.
1225
1226
Akron941c1a62016-02-23 17:41:41 +01001227=item B<--skip|-s> <foundry>[#<layer>]
1228
Akronf7ad89e2016-03-16 18:22:47 +01001229Skip specific annotations by specifying the foundry
1230(and optionally the layer with a C<#>-prefix),
1231e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001232Can be set multiple times.
1233
Akronf73ffb62018-06-27 12:13:59 +02001234
Akronc13a1702016-03-15 19:33:14 +01001235=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001236
Akronf7ad89e2016-03-16 18:22:47 +01001237Convert specific annotations by specifying the foundry
1238(and optionally the layer with a C<#>-prefix),
1239e.g. C<Mate> or C<Mate#Morpho>.
1240Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001241
Akronf73ffb62018-06-27 12:13:59 +02001242
Akron941c1a62016-02-23 17:41:41 +01001243=item B<--primary|-p>
1244
Akronc13a1702016-03-15 19:33:14 +01001245Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001246Can be flagged using C<--no-primary> as well.
1247This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001248
Akronf73ffb62018-06-27 12:13:59 +02001249
Akroned9baf02019-01-22 17:03:25 +01001250=item B<--non-word-tokens|-nwt>
1251
1252Tokenize non-word tokens like word tokens (defined as matching
1253C</[\d\w]/>). Useful to treat punctuations as tokens.
1254
1255 Defaults to unset.
1256
Akron941c1a62016-02-23 17:41:41 +01001257=item B<--jobs|-j>
1258
Define the number of concurrent jobs in separated forks
for archive processing.
Defaults to C<0> (everything runs in a single process).

Unless C<sequential-extraction> is set, this will
also apply to extraction.
1265
Akronc11f7982017-02-21 21:20:14 +01001266Pass -1, and the value will be set automatically to 5
1267times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001268This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001269
Akronf73ffb62018-06-27 12:13:59 +02001270
Akron263274c2019-02-07 09:48:30 +01001271=item B<--koral|-k>
1272
1273Version of the output format. Supported versions are:
1274C<0> for legacy serialization, C<0.03> for serialization
1275with metadata fields as key-values on the root object,
1276C<0.4> for serialization with metadata fields as a list
1277of C<"@type":"koral:field"> objects.
1278
1279Currently defaults to C<0.03>.
1280
1281
Akron9ec88872017-04-12 16:29:06 +02001282=item B<--sequential-extraction|-se>
1283
Flag to indicate that archives are extracted sequentially,
i.e. that the C<jobs> value does not apply to extraction.
Some systems may have problems with extracting multiple archives
to the same folder at the same time.
Can be flagged using C<--no-sequential-extraction> as well.
Defaults to C<false>.
1289
Akronf73ffb62018-06-27 12:13:59 +02001290
Akron35db6e32016-03-17 22:42:22 +01001291=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001292
Akron35db6e32016-03-17 22:42:22 +01001293Define the metadata parser to use. Defaults to C<I5>.
1294Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1295This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001296
Akronf73ffb62018-06-27 12:13:59 +02001297
Akron941c1a62016-02-23 17:41:41 +01001298=item B<--pretty|-y>
1299
Akronc13a1702016-03-15 19:33:14 +01001300Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001301This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron941c1a62016-02-23 17:41:41 +01001304=item B<--gzip|-z>
1305
Akronf7ad89e2016-03-16 18:22:47 +01001306Compress the output.
1307Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001308
Akronf73ffb62018-06-27 12:13:59 +02001309
Akron11c80302016-03-18 19:44:43 +01001310=item B<--cache|-c>
1311
1312File to mmap a cache (using L<Cache::FastMmap>).
1313Defaults to C<korapxml2krill.cache> in the calling directory.
1314
Akronf73ffb62018-06-27 12:13:59 +02001315
Akron11c80302016-03-18 19:44:43 +01001316=item B<--cache-size|-cs>
1317
1318Size of the cache. Defaults to C<50m>.
1319
Akronf73ffb62018-06-27 12:13:59 +02001320
Akron11c80302016-03-18 19:44:43 +01001321=item B<--cache-init|-ci>
1322
1323Initialize cache file.
1324Can be flagged using C<--no-cache-init> as well.
1325Defaults to C<true>.
1326
Akronf73ffb62018-06-27 12:13:59 +02001327
Akron11c80302016-03-18 19:44:43 +01001328=item B<--cache-delete|-cd>
1329
1330Delete cache file after processing.
1331Can be flagged using C<--no-cache-delete> as well.
1332Defaults to C<true>.
1333
Akronf73ffb62018-06-27 12:13:59 +02001334
Akron636aa112017-04-07 18:48:56 +02001335=item B<--config|-cfg>
1336
Configure the parameters of your call in a file
of key-value pairs separated by whitespace:
1339
1340 overwrite 1
1341 token DeReKo#Structure
1342 ...
1343
1344Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001345C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001346C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001347C<output>,
1348C<temp-extract>, C<sequential-extraction>,
1349C<base-sentences>, C<base-paragraphs>,
1350C<base-pagebreaks>,
1351C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001352(semicolon separated), C<anno> (semicolon separated).
1353
Akronf73ffb62018-06-27 12:13:59 +02001354Configuration parameters will always be overwritten by
1355passed parameters.
1356
1357
Akron81500102017-04-07 20:45:44 +02001358=item B<--temporary-extract|-te>
1359
1360Only valid for the C<archive> command.
1361
1362This will first extract all files into a
1363directory and then will archive.
1364If the directory is given as C<:temp:>,
1365a temporary directory is used.
1366This is especially useful to avoid
1367massive unzipping and potential
1368network latency.
Akron636aa112017-04-07 18:48:56 +02001369
Akronf73ffb62018-06-27 12:13:59 +02001370
Akrone10ad322016-02-27 10:54:26 +01001371=item B<--sigle|-sg>
1372
Akron20807582016-10-26 17:11:34 +02001373Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001374Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001375I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001376Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001377In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001378On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001379
Akronf73ffb62018-06-27 12:13:59 +02001380
Akron941c1a62016-02-23 17:41:41 +01001381=item B<--log|-l>
1382
1383The L<Log4perl> log level, defaults to C<ERROR>.
1384
Akronf73ffb62018-06-27 12:13:59 +02001385
Akron941c1a62016-02-23 17:41:41 +01001386=item B<--help|-h>
1387
1388Print this document.
1389
Akronf73ffb62018-06-27 12:13:59 +02001390
Akron941c1a62016-02-23 17:41:41 +01001391=item B<--version|-v>
1392
1393Print version information.
1394
1395=back
1396
Akronf73ffb62018-06-27 12:13:59 +02001397
Akronc13a1702016-03-15 19:33:14 +01001398=head1 ANNOTATION SUPPORT
1399
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1401developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1402The base foundry with paragraphs, sentences, and the text element are mandatory for
1403L<Krill|https://github.com/KorAP/Krill>.
1404
Akron821db3d2017-04-06 21:19:31 +02001405 Base
1406 #Paragraphs
1407 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001408
Akron821db3d2017-04-06 21:19:31 +02001409 Connexor
1410 #Morpho
1411 #Phrase
1412 #Sentences
1413 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001414
Akron821db3d2017-04-06 21:19:31 +02001415 CoreNLP
1416 #Constituency
1417 #Morpho
1418 #NamedEntities
1419 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001420
Akronce125b62017-06-19 11:54:36 +02001421 CMC
1422 #Morpho
1423
Akron821db3d2017-04-06 21:19:31 +02001424 DeReKo
1425 #Structure
Akronc13a1702016-03-15 19:33:14 +01001426
Akron821db3d2017-04-06 21:19:31 +02001427 DRuKoLa
1428 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001429
Akron821db3d2017-04-06 21:19:31 +02001430 Glemm
1431 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001432
Akronea1aed52018-07-19 14:43:34 +02001433 HNC
1434 #Morpho
1435
Akron4c679192018-01-16 17:41:49 +01001436 LWC
1437 #Dependency
1438
Akron821db3d2017-04-06 21:19:31 +02001439 Malt
1440 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001441
Akron821db3d2017-04-06 21:19:31 +02001442 MarMoT
1443 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001444
Akron821db3d2017-04-06 21:19:31 +02001445 Mate
1446 #Dependency
1447 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001448
Akron821db3d2017-04-06 21:19:31 +02001449 MDParser
1450 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001451
Akron821db3d2017-04-06 21:19:31 +02001452 OpenNLP
1453 #Morpho
1454 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001455
Akron821db3d2017-04-06 21:19:31 +02001456 Sgbr
1457 #Lemma
1458 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001459
Akron821db3d2017-04-06 21:19:31 +02001460 TreeTagger
1461 #Morpho
1462 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001463
Akron821db3d2017-04-06 21:19:31 +02001464 XIP
1465 #Constituency
1466 #Morpho
1467 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001468
Akronc13a1702016-03-15 19:33:14 +01001469
1470More importers are in preparation.
1471New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1472See the built-in annotation importers as examples.
1473
Akronf73ffb62018-06-27 12:13:59 +02001474
Akron941c1a62016-02-23 17:41:41 +01001475=head1 AVAILABILITY
1476
1477 https://github.com/KorAP/KorAP-XML-Krill
1478
1479
1480=head1 COPYRIGHT AND LICENSE
1481
Akroned9baf02019-01-22 17:03:25 +01001482Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001483
Akron941c1a62016-02-23 17:41:41 +01001484Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001485
Akrona76d8352016-10-27 16:27:32 +02001486Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001487
1488L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1489Corpus Analysis Platform at the
1490L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1491member of the
1492L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1493
1494This program is free software published under the
1495L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1496
1497=cut