blob: cc0490a4f65c78e83394826347f862909722aa8c [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron7d4d2d72024-09-05 11:05:35 +02004use v5.32;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010018use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron0a0d1f92024-11-14 14:31:42 +010022use Path::Iterator::Rule;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akroncb12af72025-07-15 14:36:10 +020025use File::Temp qw/tempdir tempfile/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Marc Kupietzb8c53822024-03-16 18:54:08 +0100174# 2024/03/20
175# - Added Spacy support.
176#
Akronebbac2e2024-03-22 10:31:23 +0100177# 2024/03/22
178# - Improve core count logging.
Akron941c1a62016-02-23 17:41:41 +0100179# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100180
# Script metadata
our $LAST_CHANGE   = '2025/07/15';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# Banner that is passed as the message to pod2usage
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command: A leading argument that does not start
# with a dash is taken as the command and removed from @ARGV
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Keep a copy of the arguments that follow the command
my @keep_argv = @ARGV;

# Options that may be given multiple times
my @skip;
my @sigle;
my @anno;
my @input;

# Configuration hash
my %cfg;
Akrone10ad322016-02-27 10:54:26 +0100200
# Count jobs/cores if not set.
#
# Tries to read the number of CPU cores using Sys::Info, which is
# loaded at runtime. If the module can't be loaded, the lookup fails
# or no positive number is reported, an error is logged and a single
# core is assumed. The number of jobs is five times the number of cores.
#
# Returns a list of two values: The number of jobs and a message
# describing the number of jobs and cores.
sub count_jobs {
  my $cores;

  # Sys::Info may not be installed - only load on demand
  if (eval("use Sys::Info; 1;") && eval("use Sys::Info::Constants qw( :device_cpu ); 1;")) {

    # Guard the lookup, so a failing probe does not abort the script
    $cores = eval { Sys::Info->new->device('CPU')->count };
  };

  # Fall back to one core if the count is missing or not positive
  unless (defined $cores && $cores > 0) {
    $log->error('Unable to determine number of cores - set to 1');
    $cores = 1;
  };

  my $jobs = ceil(5 * $cores);
  return $jobs, "Run using $jobs jobs on $cores cores";
}
220
# Parse options from the command line
my $cfg_file;

# Flags that are no longer supported are still accepted,
# but only emit a warning
my $unsupported = sub {
  my $flag = shift;
  return sub {
    warn "$flag flag no longer supported!";
  };
};

# Pass a short message (without manual sections) to pod2usage
my $short_usage = sub {
  my $msg = shift;
  pod2usage(
    -verbose => 0,
    -msg     => $msg,
    -output  => '-'
  );
};

GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{temporary_extract},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cfg{cache},
  'config|cfg=s'             => \$cfg_file,
  'lang=s'                   => \$cfg{lang},
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,
  'primary|p!'               => $unsupported->('Primary'),
  'quiet'                    => \$cfg{quiet},
  'pretty|y'                 => $unsupported->('Pretty'),
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Show the manual sections along with the version
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Show the version only
  'version|v' => sub {
    $short_usage->($VERSION_MSG);
  },

  # Show the number of jobs that would be used on this machine
  'job-count|jc' => sub {
    my ($count, $msg) = count_jobs();
    $short_usage->($msg);
  }
);
281
# Parameters for pod2usage in case of an erroneous call
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Load from configuration and fill non-given data.
# Values given on the command line always take precedence.
if ($cfg_file && -e $cfg_file) {
  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single value options - the keys in %cfg use
  # underscores instead of dashes
  my @single_options = qw!output cache-size input-base token overwrite
                          meta base-sentences base-paragraphs base-pagebreaks
                          gzip to-tar log lang cache non-word-tokens
                          non-verbal-tokens sequential-extraction
                          temporary-extract cache-init cache-delete
                          koral extract-dir jobs quiet!;

  for my $option (@single_options) {
    next unless defined $config{$option};

    (my $key = $option) =~ tr/-/_/;
    $cfg{$key} //= $config{$option};
  };

  # Multi value options (Skip, Sigle, Anno) are
  # separated by semicolons in the configuration
  my @multi_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  for my $multi (@multi_options) {
    my ($option, $values) = @$multi;
    next if scalar(@$values) || !defined $config{$option};

    @$values = split /\s*;\s*/, $config{$option};
  };
};
325
# Init variables and set default values
my $output                = $cfg{output};
my $input_base            = $cfg{input_base};
my $gzip                  = $cfg{gzip};
my $to_tar                = $cfg{to_tar};
my $extract_dir           = $cfg{temporary_extract};
my $token_base            = $cfg{token} // 'OpenNLP#tokens';
my $cache_file            = $cfg{cache} // 'korapxml2krill.cache';
my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Normalize the quiet flag to 1 or 0
# (a double negation is always defined, so a "// 0" default never applies)
my $q = $cfg{quiet} ? 1 : 0;

# Get tokenization basis in the form "Foundry#Layer".
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension - the layer is undefined
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup of lowercased annotations to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR - only errors by default
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);
Akron63f20d42017-04-10 23:40:29 +0200357
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the next argument left in @ARGV
  my $log_file = shift @ARGV;

  # Fail for a missing argument as well as for a missing file
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  my $slimmer = KorAP::XML::Log::Slim->new($log_file);
  $slimmer->slim_to;

  exit;
};
379
380
# For commands the output has to be an existing directory,
# unless the to-tar option is defined
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined and
# gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000391
# Start serial processing:
# Every input is processed by a separate run of the archive command
if ($cmd && $cmd eq 'serial') {

  # Remove all inputs and outputs from the arguments passed to
  # the archive commands. The options may be given with a separate
  # value ("-i file") or in assignment form ("--input=file"),
  # both with a single or a double dash.
  my $io_flag = qr/^--?(?:input|i|output|o)/;
  my $remove_next = 0;
  my @passed_argv;

  foreach my $arg (@keep_argv) {

    # Flag - the value follows as the next argument
    if ($arg =~ /$io_flag$/) {
      $remove_next = 1;
    }

    # Flag and value in a single argument
    elsif ($arg =~ /$io_flag=/) {
      $remove_next = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };

  @keep_argv = @passed_argv;

  # Remember if any of the archive commands failed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    unless ($q) {
      print "Start serial processing of $in to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving - continue with the next input on
    # failure, but report the failure in the exit code
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $in failed");
      $failed = 1;
    };
  };

  exit($failed ? 1 : 0);
};
444
# Define supported (and preinstalled) transformation modules.
# Each entry is an array of foundry, layer and an optional parameter.
my @layers = ();

# Add one entry per layer for a given foundry
my $add_layers = sub {
  my $foundry = shift;
  push @layers, map { [$foundry, $_] } @_;
};

# Base - only added if the base-sentences or
# base-paragraphs options are not set
$add_layers->('Base', 'Sentences')  unless $base_sentences;
$add_layers->('Base', 'Paragraphs') unless $base_paragraphs;

# Connexor
$add_layers->('Connexor', qw/Morpho Syntax Phrase Sentences/);

# CoreNLP
$add_layers->('CoreNLP', qw/NamedEntities Sentences Morpho Constituency/);

# CorpusExplorer
$add_layers->('CorpusExplorer', 'Morpho');

# CMC
$add_layers->('CMC', 'Morpho');

# DeReKo - collect all structures based on DeReKo#Structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# The collected structures are passed as a third parameter
my @dereko_layer = ('DeReKo', 'Structure');
push @dereko_layer, 'base-' . join('-', @dereko_attr) if @dereko_attr;
push @layers, \@dereko_layer;

# DGD
$add_layers->('DGD', 'Morpho');
push(@layers, ['DGD', 'Structure', 'base-sentence'])
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
$add_layers->('DRuKoLa', 'Morpho');

# Gingko
$add_layers->('Gingko', 'Morpho');

# Glemm
$add_layers->('Glemm', 'Morpho');

# HNC
$add_layers->('HNC', 'Morpho');

# LWC
$add_layers->('LWC', 'Dependency');

# Malt
$add_layers->('Malt', 'Dependency');

# Marmot
$add_layers->('MarMoT', 'Morpho');

# Mate
$add_layers->('Mate', qw/Morpho Dependency/);

# MDParser
$add_layers->('MDParser', 'Dependency');

# NKJP
$add_layers->('NKJP', qw/Morpho NamedEntities/);

# OpenNLP
$add_layers->('OpenNLP', qw/Morpho Sentences/);

# Redewiedergabe
$add_layers->('RWK', 'Morpho');
$add_layers->('RWK', 'Structure') if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
$add_layers->('Sgbr', qw/Lemma Morpho/);

# Spacy
$add_layers->('Spacy', qw/Morpho Dependency/);

# Talismane
$add_layers->('Talismane', qw/Dependency Morpho/);

# TreeTagger
$add_layers->('TreeTagger', qw/Morpho Sentences/);

# UDPipe
$add_layers->('UDPipe', qw/Morpho Dependency/);

# XIP
$add_layers->('XIP', qw/Morpho Constituency Sentences Dependency/);
Akron4fa37c32017-01-20 14:43:10 +0100581
# Check filters
my @filtered_anno;

# All predefined annotations are skipped -
# only use the annotations given explicitly as "Foundry#Layer"
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
600
Akrone1dbc382016-07-08 22:24:52 +0200601
# TODO: This should not be initialized for batch
# Parameters of the cache that is passed to the batch object
my @cache_param = (
  share_file     => $cache_file,
  cache_size     => $cfg{cache_size} // '50m',
  init_file      => $cfg{cache_init} // 1,
  unlink_on_exit => $cache_delete
);
my $cache = Cache::FastMmap->new(@cache_param);

# Parameters of the batch object -
# unset options fall back to their defaults
my @batch_param = (
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => $cfg{koral} // $KORAL_VERSION,
  anno              => \@filtered_anno,
  non_word_tokens   => $cfg{non_word_tokens} // 0,
  non_verbal_tokens => $cfg{non_verbal_tokens} // 0,
  lang              => $cfg{lang},
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
625
# Auto adjust jobs, if the number of jobs is given as -1
if ($jobs eq '-1') {
  my $msg;
  ($jobs, $msg) = count_jobs();
  print $msg . "\n" unless $q;
};

# Glob and prefix files
if (@input > 0) {

  # Prefix each input with the input root (if given)
  # and expand the wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n" unless $q;
};
651
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # Expanding the wildcards may have left no input at all
  unless (defined $input) {
    $log->error('No input found');
    exit 1;
  };

  # Take the start times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Make sure the input ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  stop_time();

  exit;
};
681
Nils Diewald59094f22014-11-05 18:20:50 +0000682
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object, if the primary input is a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Attach all further inputs as annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    # Extract without any report
    if ($q) {
      $archive->extract_sigle($q, [$sigle], $output, $jobs);
      next;
    };

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle(0, [$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
731
Akron81500102017-04-07 20:45:44 +0200732
Akron941c1a62016-02-23 17:41:41 +0100733# Process an archive
734elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000735
Akron81500102017-04-07 20:45:44 +0200736 my $archive_output;
737
738 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100739 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200740
741 # Create new archive object
742 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
743
744 # Check zip capabilities
745 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200746 $log->error("Unzip is not installed or incompatible.");
747 exit 1;
Akron81500102017-04-07 20:45:44 +0200748 };
749
750 # Add further annotation archived
751 $archive->attach($_) foreach @input[1..$#input];
752
753 # Create a temporary directory
754 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200755 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100756 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200757 };
758
Akron63f20d42017-04-10 23:40:29 +0200759 # Add some random extra to avoid clashes with multiple archives
760 $extract_dir = catdir($extract_dir, random_string('cccccc'));
761
Akron31a08cb2019-02-20 20:43:26 +0100762 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100763 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
764 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200765 @input = ($extract_dir);
766 }
767 else {
768 $log->error('Unable to extract from primary archive ' . $input[0] .
769 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200770 exit 1;
Akron81500102017-04-07 20:45:44 +0200771 };
772 }
773
774 # Can't create archive object
775 else {
776 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200777 exit 1;
Akron81500102017-04-07 20:45:44 +0200778 };
779 };
780
Akron7d4cdd82016-08-17 21:39:45 +0200781 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100782 my $pool = Parallel::ForkManager->new($jobs);
783
Akron7d4cdd82016-08-17 21:39:45 +0200784 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100785 my $iter = 1; # Current text in process
786
Akronda3097e2017-04-23 19:53:57 +0200787 my $tar_archive;
788 my $output_dir = $output;
789 my $tar_fh;
Akroncb12af72025-07-15 14:36:10 +0200790 my $final_tar_file;
791 my %tar_pool;
792 my $next_tar = 1; # Counter for tar assignment
Akronda3097e2017-04-23 19:53:57 +0200793
794 # Initialize tar archive
795 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200796 # Set output name
Akroncb12af72025-07-15 14:36:10 +0200797 $final_tar_file = $output;
798 unless ($final_tar_file =~ /\.tar$/) {
799 $final_tar_file .= '.tar';
Akronda3097e2017-04-23 19:53:57 +0200800 };
801
Akroncb12af72025-07-15 14:36:10 +0200802 print "Writing to file $final_tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200803
Akroncb12af72025-07-15 14:36:10 +0200804 # Create tar pool with size equal to number of jobs
805 # If jobs is 0, create just one tar file
806 my $pool_size = $jobs > 0 ? $jobs : 1;
807 for my $i (1..$pool_size) {
808 my ($fh, $temp_tar) = tempfile(
809 "korapxml2krill_pool_${i}_XXXX",
810 SUFFIX => '.tar',
811 TMPDIR => 1
Akroneb370a02022-02-24 13:33:40 +0100812 );
813
Akroncb12af72025-07-15 14:36:10 +0200814 $tar_pool{$i} = {
815 fh => $fh,
816 file => $temp_tar,
817 };
Akroneb370a02022-02-24 13:33:40 +0100818
Akroncb12af72025-07-15 14:36:10 +0200819 if (eval("use Archive::Tar::Builder; 1;")) {
820 ($tar_pool{$i}->{archive} = Archive::Tar::Builder->new(ignore_errors => 1))->set_handle($fh);
821 } else {
822 $tar_pool{$i}->{archive} = KorAP::XML::TarBuilder->new($fh);
823 }
Akroneb370a02022-02-24 13:33:40 +0100824 };
Akronda3097e2017-04-23 19:53:57 +0200825
826 # Output to temporary directory
827 $output_dir = File::Temp->newdir;
828 };
829
Akron941c1a62016-02-23 17:41:41 +0100830 # Report on fork message
831 $pool->run_on_finish (
832 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200833 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100834 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200835
Akrona3518372024-01-22 23:29:00 +0100836 unless ($q) {
837 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
838 $iter . "/$count]" .
839 ($code ? " $code" : '') .
840 ' ' . $data->[0] . "\n";
841 };
842 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200843
844 if (!$code && $to_tar && $data->[2]) {
845 my $filename = $data->[2];
Akroncb12af72025-07-15 14:36:10 +0200846 my $clean_file = fileparse($filename);
Akronda3097e2017-04-23 19:53:57 +0200847
Akroncb12af72025-07-15 14:36:10 +0200848 # Get next available tar file in round-robin fashion
849 my $pool_size = $jobs > 0 ? $jobs : 1;
850 my $pool_idx = $next_tar;
851 $next_tar = ($next_tar % $pool_size) + 1;
Akronda3097e2017-04-23 19:53:57 +0200852
Akroncb12af72025-07-15 14:36:10 +0200853 my $tar = $tar_pool{$pool_idx};
Akron9a062ce2017-07-04 19:12:05 +0200854
Akroncb12af72025-07-15 14:36:10 +0200855 # Lock the tar file before writing
856 flock($tar->{fh}, LOCK_EX);
Akronda3097e2017-04-23 19:53:57 +0200857
Akroncb12af72025-07-15 14:36:10 +0200858 # Add file to pool tar
859 $tar->{archive}->archive_as($filename => $clean_file);
860
861 # Release lock
862 flock($tar->{fh}, LOCK_UN);
863
864 unlink $filename;
Akronda3097e2017-04-23 19:53:57 +0200865 };
866
Akron4c0cf312016-10-15 16:42:09 +0200867 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100868 }
869 );
870
871 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200872 my $temp;
Akrona3518372024-01-22 23:29:00 +0100873 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100874
Akron7d4cdd82016-08-17 21:39:45 +0200875 # unless (Cache::FastMmap->new(
876 # share_file => $cache_file,
877 # cache_size => $cache_size,
878 # init_file => $cache_init
879 # )) {
880 # print "Unable to intialize cache '$cache_file'\n\n";
881 # exit(1);
882 # };
Akron11c80302016-03-18 19:44:43 +0100883
Akron486f9ab2017-04-22 23:25:19 +0200884
Akron941c1a62016-02-23 17:41:41 +0100885 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100886 if (-d $input[0]) {
Akron941c1a62016-02-23 17:41:41 +0100887
Akron8b03ba52025-07-15 09:16:18 +0200888 # First pass: count files
889 my $rule_count = Path::Iterator::Rule->new;
890 $rule_count->name('data.xml')->file;
891 my $count_iter = $rule_count->iter(
892 $input[0] => {
893 sorted => 0,
894 depthfirst => -1,
895 error_handler => undef
896 });
897 $count = 0;
898 while (defined(my $file = $count_iter->())) {
899 $count++;
900 };
Akronce033502024-09-11 10:51:49 +0200901
Akron8b03ba52025-07-15 09:16:18 +0200902 print "Start processing ...\n" unless $q;
903 $t = Benchmark->new;
904
905 # Second pass: process files using iterator
Akron0a0d1f92024-11-14 14:31:42 +0100906 my $rule = Path::Iterator::Rule->new;
907 $rule->name('data.xml')->file;
908 my $next = $rule->iter(
909 $input[0] => {
910 sorted => 0,
911 depthfirst => -1,
912 error_handler => undef
913 });
Akron941c1a62016-02-23 17:41:41 +0100914
915 DIRECTORY_LOOP:
Akron8b03ba52025-07-15 09:16:18 +0200916 while (defined(my $file = $next->())) {
917 # Remove data.xml suffix to get directory path
918 $file =~ s/\/data\.xml$//;
Akron941c1a62016-02-23 17:41:41 +0100919
Akrone1dbc382016-07-08 22:24:52 +0200920 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200921 $output_dir,
Akron8b03ba52025-07-15 09:16:18 +0200922 get_file_name($input[0], $file) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200923 );
Akron941c1a62016-02-23 17:41:41 +0100924
925 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200926 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200927
Akron8b03ba52025-07-15 09:16:18 +0200928 if (my $return = $batch_file->process($file => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200929 $pool->finish(
930 0,
Akronda3097e2017-04-23 19:53:57 +0200931 [
932 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
933 undef,
934 $filename
935 ]
Akron486f9ab2017-04-22 23:25:19 +0200936 );
Akron3ec48972016-08-17 23:24:52 +0200937 }
938 else {
Akron8b03ba52025-07-15 09:16:18 +0200939 $pool->finish(1, ["Unable to process " . $file]);
Akron3ec48972016-08-17 23:24:52 +0200940 };
Akron941c1a62016-02-23 17:41:41 +0100941 };
942 }
943
944 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200945 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200946
Akron941c1a62016-02-23 17:41:41 +0100947 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200948 $log->error("Unzip is not installed or incompatible.");
949 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100950 };
951
Akron08385f62016-03-22 20:37:04 +0100952 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200953 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100954
Akron31a08cb2019-02-20 20:43:26 +0100955 # Get sigles to extract
956 my $prefix = set_sigle($archive);
957
Akrona3518372024-01-22 23:29:00 +0100958 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100959 $t = Benchmark->new;
Akron8b03ba52025-07-15 09:16:18 +0200960
961 # Get count of texts
962 $count = $archive->count_texts;
963
964 # Get iterator for text paths
965 my $text_iter = $archive->list_texts_iterator;
966
967 # Process texts one at a time using the iterator
968 ARCHIVE_LOOP:
969 while (defined(my $text_path = $text_iter->())) {
Akron941c1a62016-02-23 17:41:41 +0100970 # Split path information
Akron8b03ba52025-07-15 09:16:18 +0200971 my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);
Akron941c1a62016-02-23 17:41:41 +0100972
Akrone1dbc382016-07-08 22:24:52 +0200973 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200974 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200975 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200976 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200977 catfile($corpus, $doc, $text)
978 . '.json' . ($gzip ? '.gz' : '')
979 )
Akrone1dbc382016-07-08 22:24:52 +0200980 );
Akron941c1a62016-02-23 17:41:41 +0100981
982 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200983 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100984
Akron4c0cf312016-10-15 16:42:09 +0200985 # Create temporary file
986 $temp = File::Temp->newdir;
987
Akronbdf434a2016-10-24 17:42:07 +0200988 # TODO: Check if $filename exist at the beginning,
989 # because extraction can be horrible slow!
990
Akron941c1a62016-02-23 17:41:41 +0100991 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100992 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100993
Akron7d4cdd82016-08-17 21:39:45 +0200994 # Create corpus directory
995 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100996
Akron7d4cdd82016-08-17 21:39:45 +0200997 # Temporary directory
998 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100999
Akron7d4cdd82016-08-17 21:39:45 +02001000 # Write file
Akron13d56622016-10-31 14:54:49 +01001001 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +02001002
Akron4c0cf312016-10-15 16:42:09 +02001003 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001004 $pool->finish(
1005 0,
Akronda3097e2017-04-23 19:53:57 +02001006 [
1007 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1008 $temp,
1009 $filename
1010 ]
Akron13d56622016-10-31 14:54:49 +01001011 );
Akron7d4cdd82016-08-17 21:39:45 +02001012 }
1013 else {
Akron4c0cf312016-10-15 16:42:09 +02001014 # Delete temporary file
1015 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001016 };
Akron941c1a62016-02-23 17:41:41 +01001017 }
Akron7d4cdd82016-08-17 21:39:45 +02001018
1019 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001020 else {
Akron8b03ba52025-07-15 09:16:18 +02001021 $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
Akron941c1a62016-02-23 17:41:41 +01001022 };
1023 };
1024 }
1025
1026 else {
Akrona3518372024-01-22 23:29:00 +01001027 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +01001028 };
1029
1030 $pool->wait_all_children;
1031
Akroncb12af72025-07-15 14:36:10 +02001032 # Merge all temporary tar files into final tar if needed
1033 if ($to_tar && %tar_pool) {
1034 $| = 1;
1035 print "Merging " . scalar(keys %tar_pool) . " temporary tar files...\n" unless $q;
1036
1037 # Open final tar file
1038 my $final_fh = IO::File->new($final_tar_file, 'w') or die "Cannot open $final_tar_file: $!";
1039 $final_fh->binmode(1);
1040
1041 # Create final archive
1042 my $final_archive;
1043
1044 if (eval("use Archive::Tar::Builder; 1;")) {
1045 $final_archive = Archive::Tar::Builder->new(ignore_errors => 1);
1046 $final_archive->set_handle($final_fh);
1047 } else {
1048 $final_archive = KorAP::XML::TarBuilder->new($final_fh);
1049 }
1050
1051 # Finish and close all pool tar files
1052 foreach my $pool_idx (sort keys %tar_pool) {
1053 my $tar = $tar_pool{$pool_idx};
1054 $tar->{archive}->finish;
1055 $tar->{fh}->close;
1056
1057 # Append temp tar content to final tar using efficient buffered copy
1058 open my $temp_fh, '<:raw', $tar->{file} or die "Cannot open temp tar $tar->{file}: $!";
1059 my $buffer_size = 1024 * 1024; # 1MB buffer
1060 my $buffer;
1061 while (my $bytes_read = read($temp_fh, $buffer, $buffer_size)) {
1062 my $bytes_written = 0;
1063 while ($bytes_written < $bytes_read) {
1064 my $written = syswrite($final_fh, $buffer, $bytes_read - $bytes_written, $bytes_written);
1065 die "Write error: $!" unless defined $written;
1066 $bytes_written += $written;
1067 }
1068 }
1069 close $temp_fh;
1070
1071 # Clean up temp tar
1072 unlink $tar->{file};
1073 }
1074
1075 # Close final tar
1076 $final_archive->finish;
1077 $final_fh->close;
1078 print "Wrote to tar archive $final_tar_file\n" unless $q;
1079 }
1080
Akrona3518372024-01-22 23:29:00 +01001081 unless ($q) {
1082 print timestr(timediff(Benchmark->new, $t))."\n";
1083 print "Done.\n";
1084 };
Akron81500102017-04-07 20:45:44 +02001085};
Akron941c1a62016-02-23 17:41:41 +01001086
Nils Diewald2db9ad02013-10-29 19:26:43 +00001087
# For an archive, this will create the list
# of all sigles to process.
#
# Parameters:
#   $archive - archive object; the methods called on it here are
#              list_texts, split_path, check_prefix and extract_sigle.
#
# Reads the file-level variables @sigle, $q, $output,
# $sequential_extraction and $jobs, and rewrites @sigle:
#
# - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#   per text path listed by the archive.
# - Otherwise every entry shaped like a doc sigle (two path segments,
#   optionally preceded by "./" or ".\") is extracted to $output right
#   away and dropped from @sigle; all other entries are kept as
#   text sigles.
#
# Returns the prefix value: the first element of the last split_path()
# result in the first case, the result of check_prefix() in the second
# case (or the initial 1 if neither method was ever called).
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect all texts of the archive
  unless (@sigle) {
    foreach my $text_path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($text_path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Number of jobs passed on to the extraction
  my $extract_jobs = $sequential_extraction ? 1 : $jobs;

  # Iterate over all sigle
  foreach my $entry (@sigle) {
    my $is_doc_sigle = $entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$entry ..." if $is_doc_sigle && !$q;

    # Check once if a prefix is needed. The assignment is kept separate
    # from the quiet check: "$prefix = check_prefix && !$q" would store
    # the result of the whole && expression and lose the prefix
    # whenever $q is set.
    unless ($prefix_checked) {
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix && !$q;
      $prefix_checked = 1;
    };

    # Sigle is a text sigle - keep it for later processing
    unless ($is_doc_sigle) {
      push @text_sigle, $entry;
      next;
    };

    # Sigle is a doc sigle - extract the whole document now
    print "\n" unless $q;
    my $extracted = $archive->extract_sigle($q, [$entry], $output, $extract_jobs);
    print '... ' . ($extracted ? '' : 'not ') . "extracted.\n" unless $q;
  };

  @sigle = @text_sigle;

  return $prefix;
};
1166
1167
# Remove the temporary extraction directory, if one is set,
# and log the count returned by remove_tree
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info('Removed directory ' . $extract_dir . ' with ' . $removed . ' objects');
};

# Finish the output with a trailing newline
print "\n";
1176
Nils Diewald2db9ad02013-10-29 19:26:43 +00001177__END__
Akron941c1a62016-02-23 17:41:41 +01001178
1179=pod
1180
1181=encoding utf8
1182
1183=head1 NAME
1184
Akron42f48c12020-02-14 13:08:13 +01001185korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001186
1187
1188=head1 SYNOPSIS
1189
Akron9cb8c982024-03-22 10:46:56 +01001190 $ korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001191
Akron2fd402b2016-10-27 21:26:48 +02001192
Akron941c1a62016-02-23 17:41:41 +01001193=head1 DESCRIPTION
1194
1195L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1196compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001197The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001198
1199
1200=head1 INSTALLATION
1201
1202The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1203
Akron9cb8c982024-03-22 10:46:56 +01001204 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001205
Akronc13a1702016-03-15 19:33:14 +01001206In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001207be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001208Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001209Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1210Optional support for L<Sys::Info> to calculate available cores is available.
Akrona93d51b2016-10-24 20:27:48 +02001211In addition to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001212
1213=head1 ARGUMENTS
1214
Akron9cb8c982024-03-22 10:46:56 +01001215 $ korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001216
1217Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001218It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001219
Akron941c1a62016-02-23 17:41:41 +01001220=over 2
1221
1222=item B<archive>
1223
Akron9cb8c982024-03-22 10:46:56 +01001224 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001225
Akron2fd402b2016-10-27 21:26:48 +02001226Converts an archive of KorAP-XML documents. It expects a directory
1227(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001228
1229=item B<extract>
1230
Akron9cb8c982024-03-22 10:46:56 +01001231 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001232
1233Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001234
Akron63f20d42017-04-10 23:40:29 +02001235=item B<serial>
1236
Akron9cb8c982024-03-22 10:46:56 +01001237 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001238
Akronce033502024-09-11 10:51:49 +02001239Convert archives in serial. The inputs are not merged but treated
Akron63f20d42017-04-10 23:40:29 +02001240as they are (so they may be premerged or globs).
1241The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001242are created based on the archive name. In case the C<--to-tar> flag is given,
1243the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001244
1245
Akron84b53ad2022-01-14 12:39:15 +01001246=item B<slimlog>
1247
Akron9cb8c982024-03-22 10:46:56 +01001248 $ korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001249
1250Filters out all useless aka successful information from logs, to simplify
1251log checks. Expects no further options.
1252
1253
Akron941c1a62016-02-23 17:41:41 +01001254=back
1255
1256
1257=head1 OPTIONS
1258
1259=over 2
1260
Akrona76d8352016-10-27 16:27:32 +02001261=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001262
Akrona76d8352016-10-27 16:27:32 +02001263Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001264
Akron7606afa2016-10-25 16:23:49 +02001265Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001266document, while C<archive> expects a KorAP-XML corpus folder or a zip
1267file to batch process multiple files.
1268C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001269
Akrondee3cf62024-06-14 18:14:48 +02001270C<archive> supports multiple input zip files with the constraint
Akron2cfe8092016-06-24 17:48:49 +02001271that the first archive listed contains all primary data files
1272and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001273
Akron7606afa2016-10-25 16:23:49 +02001274 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001275
Akron821db3d2017-04-06 21:19:31 +02001276Input may also be defined using BSD glob wildcards.
1277
1278 -i 'file/news*.zip'
1279
1280The extended input array will be sorted in length order, so the shortest
1281path needs to contain all primary data files and all meta data files.
1282
Akrondee3cf62024-06-14 18:14:48 +02001283(The directory structure follows the base directory format
Akron0c3e3752016-06-28 15:55:53 +02001284that may include a C<.> root folder.
1285In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001286need to be passed with a hash sign in front of the archive's name.
1287This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001288
Akron7606afa2016-10-25 16:23:49 +02001289To support zip files, a version of C<unzip> needs to be installed that is
1290compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001291
Akron7606afa2016-10-25 16:23:49 +02001292B<The root folder switch using the hash sign is experimental and
1293may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001294
Akronf73ffb62018-06-27 12:13:59 +02001295
Akron63f20d42017-04-10 23:40:29 +02001296=item B<--input-base|-ib> <directory>
1297
1298The base directory for inputs.
1299
1300
Akron941c1a62016-02-23 17:41:41 +01001301=item B<--output|-o> <directory|file>
1302
1303Output folder for archive processing or
1304document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001305writes to C<STDOUT> by default
1306(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001307
1308=item B<--overwrite|-w>
1309
1310Overwrite files that already exist.
1311
Akronf73ffb62018-06-27 12:13:59 +02001312
Akron3741f8b2016-12-21 19:55:21 +01001313=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001314
1315Define the default tokenization by specifying
1316the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001317of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001318This will directly take the file instead of running
1319the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001320
Akron3741f8b2016-12-21 19:55:21 +01001321
1322=item B<--base-sentences|-bs> <foundry>#<layer>
1323
1324Define the layer for base sentences.
1325If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001326Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1327layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001328
1329 Defaults to unset.
1330
1331
1332=item B<--base-paragraphs|-bp> <foundry>#<layer>
1333
1334Define the layer for base paragraphs.
1335If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001336Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1337layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001338
1339 Defaults to unset.
1340
1341
Akron41ac10b2017-02-08 22:47:25 +01001342=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1343
1344Define the layer for base pagebreaks.
1345Currently C<DeReKo#Structure> is the only layer supported.
1346
1347 Defaults to unset.
1348
1349
Akron941c1a62016-02-23 17:41:41 +01001350=item B<--skip|-s> <foundry>[#<layer>]
1351
Akronf7ad89e2016-03-16 18:22:47 +01001352Skip specific annotations by specifying the foundry
1353(and optionally the layer with a C<#>-prefix),
1354e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001355Can be set multiple times.
1356
Akronf73ffb62018-06-27 12:13:59 +02001357
Akronc13a1702016-03-15 19:33:14 +01001358=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001359
Akronf7ad89e2016-03-16 18:22:47 +01001360Convert specific annotations by specifying the foundry
1361(and optionally the layer with a C<#>-prefix),
1362e.g. C<Mate> or C<Mate#Morpho>.
1363Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001364
Akronf73ffb62018-06-27 12:13:59 +02001365
Akroned9baf02019-01-22 17:03:25 +01001366=item B<--non-word-tokens|-nwt>
1367
1368Tokenize non-word tokens like word tokens (defined as matching
1369C</[\d\w]/>). Useful to treat punctuation as tokens.
1370
1371 Defaults to unset.
1372
Akronf1849aa2019-12-16 23:35:33 +01001373
1374=item B<--non-verbal-tokens|-nvt>
1375
1376Tokenize non-verbal tokens marked in the primary data as
1377the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1378
1379 Defaults to unset.
1380
1381
Akron941c1a62016-02-23 17:41:41 +01001382=item B<--jobs|-j>
1383
Akron29128262024-04-17 15:50:36 +02001384Define the number of spawned forks for concurrent jobs
1385of archive processing.
Akron11c80302016-03-18 19:44:43 +01001386Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001387
Akrona472a242023-02-13 13:46:30 +01001388If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001389also apply to extraction.
1390
Akronebbac2e2024-03-22 10:31:23 +01001391Pass C<-1>, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001392times the number of available cores, in case L<Sys::Info>
Akronebbac2e2024-03-22 10:31:23 +01001393is available and can read CPU count (see C<--job-count>).
1394Be aware, that the report of available cores
Akron29128262024-04-17 15:50:36 +02001395may not work in certain conditions. Benchmarking the processing
1396speed based on the number of jobs may be valuable.
Akronebbac2e2024-03-22 10:31:23 +01001397
Akronf7ad89e2016-03-16 18:22:47 +01001398This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001399
Akronf73ffb62018-06-27 12:13:59 +02001400
Akronebbac2e2024-03-22 10:31:23 +01001401=item B<--job-count|-jc>
1402
1403Print job and core information that would be used if
1404C<-1> was passed to C<--jobs>.
1405
1406
Akron263274c2019-02-07 09:48:30 +01001407=item B<--koral|-k>
1408
1409Version of the output format. Supported versions are:
1410C<0> for legacy serialization, C<0.03> for serialization
1411with metadata fields as key-values on the root object,
1412C<0.4> for serialization with metadata fields as a list
1413of C<"@type":"koral:field"> objects.
1414
1415Currently defaults to C<0.03>.
1416
1417
Akron9ec88872017-04-12 16:29:06 +02001418=item B<--sequential-extraction|-se>
1419
1420Flag to force sequential extraction, i.e. the C<jobs> value is not applied to extraction.
1421Some systems may have problems with extracting multiple archives
1422to the same folder at the same time.
1423Can be flagged using C<--no-sequential-extraction> as well.
1424Defaults to C<false>.
1425
Akronf73ffb62018-06-27 12:13:59 +02001426
Akron35db6e32016-03-17 22:42:22 +01001427=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001428
Akron35db6e32016-03-17 22:42:22 +01001429Define the metadata parser to use. Defaults to C<I5>.
1430Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1431This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001432
Akronf73ffb62018-06-27 12:13:59 +02001433
Akron941c1a62016-02-23 17:41:41 +01001434=item B<--gzip|-z>
1435
Akronf7ad89e2016-03-16 18:22:47 +01001436Compress the output.
1437Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001438
Akronf73ffb62018-06-27 12:13:59 +02001439
Akron11c80302016-03-18 19:44:43 +01001440=item B<--cache|-c>
1441
1442File to mmap a cache (using L<Cache::FastMmap>).
1443Defaults to C<korapxml2krill.cache> in the calling directory.
1444
Akronf73ffb62018-06-27 12:13:59 +02001445
Akron11c80302016-03-18 19:44:43 +01001446=item B<--cache-size|-cs>
1447
1448Size of the cache. Defaults to C<50m>.
1449
Akronf73ffb62018-06-27 12:13:59 +02001450
Akron11c80302016-03-18 19:44:43 +01001451=item B<--cache-init|-ci>
1452
1453Initialize cache file.
1454Can be flagged using C<--no-cache-init> as well.
1455Defaults to C<true>.
1456
Akronf73ffb62018-06-27 12:13:59 +02001457
Akron11c80302016-03-18 19:44:43 +01001458=item B<--cache-delete|-cd>
1459
1460Delete cache file after processing.
1461Can be flagged using C<--no-cache-delete> as well.
1462Defaults to C<true>.
1463
Akronf73ffb62018-06-27 12:13:59 +02001464
Akron636aa112017-04-07 18:48:56 +02001465=item B<--config|-cfg>
1466
1467Configure the parameters of your call in a file
1468of key-value pairs with whitespace separator
1469
1470 overwrite 1
1471 token DeReKo#Structure
1472 ...
1473
1474Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001475C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akronc0ac4ff2024-04-15 18:03:15 +02001476C<token>, C<log>,
1477C<cache>, C<cache-size>, C<cache-init>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001478C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001479C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001480C<base-sentences>, C<base-paragraphs>,
1481C<base-pagebreaks>,
1482C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001483(semicolon separated), C<anno> (semicolon separated).
1484
Akronf73ffb62018-06-27 12:13:59 +02001485Configuration parameters will always be overwritten by
1486passed parameters.
1487
1488
Akron81500102017-04-07 20:45:44 +02001489=item B<--temporary-extract|-te>
1490
Akrona472a242023-02-13 13:46:30 +01001491Only valid for the C<archive> and C<serial>
1492commands.
Akron81500102017-04-07 20:45:44 +02001493
1494This will first extract all files into a
1495directory and then will archive.
1496If the directory is given as C<:temp:>,
1497a temporary directory is used.
1498This is especially useful to avoid
1499massive unzipping and potential
1500network latency.
Akron636aa112017-04-07 18:48:56 +02001501
Akronf73ffb62018-06-27 12:13:59 +02001502
Akronc93a0802019-07-11 15:48:34 +02001503=item B<--to-tar>
1504
1505Only valid for the C<archive> command.
1506
1507Writes the output into a tar archive.
Akronec01ff42025-10-17 11:59:33 +02001508The tar needs to be opened with C<--ignore-zeros> afterwards.
Akronc93a0802019-07-11 15:48:34 +02001509
1510
Akrone10ad322016-02-27 10:54:26 +01001511=item B<--sigle|-sg>
1512
Akron20807582016-10-26 17:11:34 +02001513Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001514Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001515I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001516Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001517In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001518On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001519
Akron64f7fae2022-07-27 12:45:33 +02001520=item B<--lang>
1521
1522Preferred language for metadata fields. In case multiple titles are
1523given (on any level) with different C<xml:lang> attributes,
1524the language given is preferred.
1525Because titles may have different sources and different priorities,
1526non-specific language titles may still be preferred in case the title
1527source has a higher priority.
1528
Akronf73ffb62018-06-27 12:13:59 +02001529
Akron941c1a62016-02-23 17:41:41 +01001530=item B<--log|-l>
1531
Akronb9c33812020-10-21 16:19:35 +02001532The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001533
Akronf73ffb62018-06-27 12:13:59 +02001534
Akrona3518372024-01-22 23:29:00 +01001535=item B<--quiet>
1536
1537Silence all information (non-log) outputs.
1538
1539
Akron941c1a62016-02-23 17:41:41 +01001540=item B<--help|-h>
1541
Akron42f48c12020-02-14 13:08:13 +01001542Print help information.
Akron941c1a62016-02-23 17:41:41 +01001543
Akronf73ffb62018-06-27 12:13:59 +02001544
Akron941c1a62016-02-23 17:41:41 +01001545=item B<--version|-v>
1546
1547Print version information.
1548
1549=back
1550
Akron311e29b2024-09-11 11:46:09 +02001551=head1 PERFORMANCE
1552
1553There are some ways to improve performance for large tasks:
1554
Akronec01ff42025-10-17 11:59:33 +02001555=over 2
1556
Akron311e29b2024-09-11 11:46:09 +02001557=item First unpack
1558
1559Using the archive or serial command on one or multiple zip files
1560can be very slow, as it needs to unpack small portions every time.
1561It's better to use C<--temporary-extract> to unpack the whole archive
1562first into a temporary directory and then read the extracted files.
1563This is especially important for remote archives.
1564
1565=item Limit annotations
1566
1567Per default, all supported annotation layers are sought. This can be limited
1568by adding C<--skip '#ALL'> and only listing the expected annotations with C<--anno>.
1569
1570=item Checking the parallel job count
1571
1572By providing the number of parallel jobs using C<--jobs>, the execution can be tailored to specific
1573hardware environments.
1574
Marc Kupietzaeac7532025-04-14 20:00:33 +02001575=item Install ripunzip
1576
1577For full extraction of data, L<ripunzip|https://github.com/google/ripunzip> can be
1578used for improved performance.
1579
Akronec01ff42025-10-17 11:59:33 +02001580=back
Marc Kupietzaeac7532025-04-14 20:00:33 +02001581
Akronc13a1702016-03-15 19:33:14 +01001582=head1 ANNOTATION SUPPORT
1583
1584L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1585developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1586The base foundry with paragraphs, sentences, and the text element is mandatory for
1587L<Krill|https://github.com/KorAP/Krill>.
1588
Akron821db3d2017-04-06 21:19:31 +02001589 Base
1590 #Paragraphs
1591 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001592
Akron821db3d2017-04-06 21:19:31 +02001593 Connexor
1594 #Morpho
1595 #Phrase
1596 #Sentences
1597 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001598
Akron821db3d2017-04-06 21:19:31 +02001599 CoreNLP
1600 #Constituency
1601 #Morpho
1602 #NamedEntities
1603 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001604
Akron5530a552022-02-17 17:53:15 +01001605 CorpusExplorer
1606 #Morpho
1607
Akronce125b62017-06-19 11:54:36 +02001608 CMC
1609 #Morpho
1610
Akron821db3d2017-04-06 21:19:31 +02001611 DeReKo
1612 #Structure
Akronc13a1702016-03-15 19:33:14 +01001613
Akron57510c12019-01-04 14:58:53 +01001614 DGD
1615 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001616 #Structure
Akron57510c12019-01-04 14:58:53 +01001617
Akron821db3d2017-04-06 21:19:31 +02001618 DRuKoLa
1619 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001620
Akron821db3d2017-04-06 21:19:31 +02001621 Glemm
1622 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001623
Akronabb36902021-10-11 15:51:06 +02001624 Gingko
1625 #Morpho
1626
Akronea1aed52018-07-19 14:43:34 +02001627 HNC
1628 #Morpho
1629
Akron4c679192018-01-16 17:41:49 +01001630 LWC
1631 #Dependency
1632
Akron821db3d2017-04-06 21:19:31 +02001633 Malt
1634 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001635
Akron821db3d2017-04-06 21:19:31 +02001636 MarMoT
1637 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001638
Akron821db3d2017-04-06 21:19:31 +02001639 Mate
1640 #Dependency
1641 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001642
Akron821db3d2017-04-06 21:19:31 +02001643 MDParser
1644 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001645
Akrone85a7762022-07-22 08:05:03 +02001646 NKJP
1647 #Morpho
1648 #NamedEntities
1649
Akron821db3d2017-04-06 21:19:31 +02001650 OpenNLP
1651 #Morpho
1652 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001653
Akron07e24772020-04-23 14:00:54 +02001654 RWK
1655 #Morpho
1656 #Structure
1657
Akron821db3d2017-04-06 21:19:31 +02001658 Sgbr
1659 #Lemma
1660 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001661
Marc Kupietzb8c53822024-03-16 18:54:08 +01001662 Spacy
Marc Kupietz23446562025-10-28 14:36:50 +01001663 #Dependency
Marc Kupietzb8c53822024-03-16 18:54:08 +01001664 #Morpho
1665
Akron7d5e6382019-08-08 16:36:27 +02001666 Talismane
1667 #Dependency
1668 #Morpho
1669
Akron821db3d2017-04-06 21:19:31 +02001670 TreeTagger
1671 #Morpho
1672 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001673
Akron83aedd32023-02-07 10:57:41 +01001674 UDPipe
1675 #Dependency
1676 #Morpho
1677
Akron821db3d2017-04-06 21:19:31 +02001678 XIP
1679 #Constituency
1680 #Morpho
1681 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001682
Akronc13a1702016-03-15 19:33:14 +01001683
1684More importers are in preparation.
1685New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1686See the built-in annotation importers as examples.
1687
Akronf73ffb62018-06-27 12:13:59 +02001688
Akron41e6c8b2021-10-14 20:22:18 +02001689=head1 METADATA SUPPORT
1690
1691L<KorAP::XML::Krill> has built-in importers for some meta data variants
Akron4b001ce2024-06-06 12:32:11 +02001692that are part of the KorAP preprocessing pipeline.
Akron41e6c8b2021-10-14 20:22:18 +02001693
1694=over 2
1695
Akron1d101492024-06-06 12:47:35 +02001696=item B<I5>
Akron41e6c8b2021-10-14 20:22:18 +02001697
Akron1d101492024-06-06 12:47:35 +02001698Meta data for all I5 files
Akron41e6c8b2021-10-14 20:22:18 +02001699
Akronec01ff42025-10-17 11:59:33 +02001700Environment variables:
1701
1702=over 4
1703
1704=item C<K2K_TRANSLATOR_TEXT>
1705
1706Index the translator as a text field (attachment otherwise).
1707
1708=item C<K2K_PUBLISHER_STRING>
1709
1710Index the publisher as a string field (attachment otherwise).
1711
1712
1713=back
1714
Akron1d101492024-06-06 12:47:35 +02001715=item B<Sgbr>
Akron41e6c8b2021-10-14 20:22:18 +02001716
Akron1d101492024-06-06 12:47:35 +02001717Meta data from the Schreibgebrauch project
Akron2532f1b2023-05-15 13:41:24 +02001718
Akron1d101492024-06-06 12:47:35 +02001719=item B<Gingko>
1720
1721Meta data from the Gingko project in addition to I5
1722
1723=item B<ICC>
1724
1725Meta data for the ICC in addition to I5
1726
1727=item B<NKJP>
1728
1729Meta data for the NKJP corpora
Akron24ad3c02024-06-03 12:38:20 +02001730
Akron41e6c8b2021-10-14 20:22:18 +02001731=back
1732
Akron41e6c8b2021-10-14 20:22:18 +02001733New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1734See the built-in meta data importers as examples.
1735
Akron4b001ce2024-06-06 12:32:11 +02001736The I5 metadata definition is based on TEI-P5 and supports C<E<lt>xenoDataE<gt>>
Akron82064bb2024-06-17 12:53:23 +02001737with C<E<lt>metaE<gt>> elements like
Akron4b001ce2024-06-06 12:32:11 +02001738
1739 <meta type="..." name="..." project="..." desc="...">...</meta>
1740
1741that are directly translated to Krill objects. The supported values are:
1742
1743=over 2
1744
Akron1d101492024-06-06 12:47:35 +02001745=item C<type>
Akron4b001ce2024-06-06 12:32:11 +02001746
1747=over 4
1748
Akron1d101492024-06-06 12:47:35 +02001749=item C<string>
Akron4b001ce2024-06-06 12:32:11 +02001750
Akron1d101492024-06-06 12:47:35 +02001751String meta data value
Akron4b001ce2024-06-06 12:32:11 +02001752
Akron1d101492024-06-06 12:47:35 +02001753=item C<keyword>
Akron4b001ce2024-06-06 12:32:11 +02001754
Akrondee3cf62024-06-14 18:14:48 +02001755String meta data value that can be given multiple times
Akron4b001ce2024-06-06 12:32:11 +02001756
Akron1d101492024-06-06 12:47:35 +02001757=item C<text>
Akron4b001ce2024-06-06 12:32:11 +02001758
Akrondee3cf62024-06-14 18:14:48 +02001759String meta data value that is tokenized and can be searched as token sequences
Akron4b001ce2024-06-06 12:32:11 +02001760
Akron1d101492024-06-06 12:47:35 +02001761=item C<date>
1762
1763Date meta data value (as "yyyy/mm/dd" with optional granularity)
1764
1765=item C<integer>
1766
1767Numerical meta data value
1768
Akrondee3cf62024-06-14 18:14:48 +02001769=item C<attachment>
Akron1d101492024-06-06 12:47:35 +02001770
1771Non-indexed meta data value (only retrievable)
1772
1773=item C<uri>
1774
1775Non-indexed attached URI, takes the desc as the title for links
Akron4b001ce2024-06-06 12:32:11 +02001776
1777=back
1778
Akron1d101492024-06-06 12:47:35 +02001779=item C<name>
Akron4b001ce2024-06-06 12:32:11 +02001780
Akrondee3cf62024-06-14 18:14:48 +02001781The key of the meta object that may be prefixed by C<corpus> or C<doc>, in case the
Akron693f5882024-06-06 12:52:39 +02001782C<E<lt>xenoDataE<gt>> information is located on these levels. The text level introduces
1783no prefixes.
Akron4b001ce2024-06-06 12:32:11 +02001784
Akron1d101492024-06-06 12:47:35 +02001785=item C<project> (optional)
Akron4b001ce2024-06-06 12:32:11 +02001786
Akron1d101492024-06-06 12:47:35 +02001787A prefixed namespace of the key
1788
1789=item C<desc> (optional)
1790
1791A description of the key
1792
1793=item text content
1794
1795The value of the meta object
Akron4b001ce2024-06-06 12:32:11 +02001796
1797=back
1798
Akron41e6c8b2021-10-14 20:22:18 +02001799
Akron8f69d632020-01-15 16:58:11 +01001800=head1 About KorAP-XML
1801
1802KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1803data model (Bański et al. 2013), where text data are stored physically
1804separated from their interpretations (i.e. annotations).
1805A text document in KorAP-XML therefore consists of several files
1806containing primary data, metadata and annotations.
1807
1808The structure of a single KorAP-XML document can be as follows:
1809
1810 - data.xml
1811 - header.xml
1812 + base
1813 - tokens.xml
1814 - ...
1815 + struct
1816 - structure.xml
1817 - ...
1818 + corenlp
1819 - morpho.xml
1820 - constituency.xml
1821 - ...
1822 + tree_tagger
1823 - morpho.xml
1824 - ...
1825 - ...
1826
1827The C<data.xml> contains the primary data, the C<header.xml> contains
1828the metadata, and the annotation layers are stored in subfolders
1829like C<base>, C<struct> or C<corenlp>
1830(so-called "foundries"; Bański et al. 2013).
1831
1832Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001833(Lüngen and Sperberg-McQueen 2012). See the documentation in
1834L<KorAP::XML::Meta::I5> for translatable fields.
1835
1836Annotations correspond to a variant of the TEI-P5 feature structures
1837(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001838Annotation feature structures refer to character sequences of the primary text
1839inside the C<text> element of the C<data.xml>.
1840A single annotation containing the lemma of a token can have the following structure:
1841
1842 <span from="0" to="3">
1843 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1844 <f name="lex">
1845 <fs>
1846 <f name="lemma">zum</f>
1847 </fs>
1848 </f>
1849 </fs>
1850 </span>
1851
1852The C<from> and C<to> attributes refer to the character span
1853in the primary text.
1854Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1855the structure may vary. See L<KorAP::XML::Annotation::*> for various
1856annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001857
1858Multiple KorAP-XML documents are organized on three levels following
1859the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1860corpus E<gt> document E<gt> text. On each level metadata information
1861can be stored, that C<korapxml2krill> will merge to a single metadata
1862object per text. A corpus is therefore structured as follows:
1863
1864 + <corpus>
1865 - header.xml
1866 + <document>
1867 - header.xml
1868 + <text>
1869 - data.xml
1870 - header.xml
1871 - ...
1872 - ...
1873
1874A single text can be identified by the concatenation of
1875the corpus identifier, the document identifier and the text identifier.
1876This identifier is called the text sigle
1877(e.g. a text with the identifier C<18486> in the document C<060> in the
1878corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1879
1880These corpora are often stored in zip files, which C<korapxml2krill>
1881can deal with. Corpora may also be split into multiple zip archives
1882(e.g. one zip file per foundry), which is also supported (see C<--input>).
1883
1884Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1885in form of a test suite.
1886The resulting JSON format merges all annotation layers
1887based on a single token stream.
1888
1889=head2 References
1890
1891Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1892KorAP data model: first approximation, December.
1893
1894Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1895"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1896Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1897L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1898
1899Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1900"Robust corpus architecture: a new look at virtual collections and data access",
1901Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1902L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1903
1904Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1905Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1906"Towards an international standard on feature structure representation",
1907Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1908pp. 373-376.
1909L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1910
1911Harald Lüngen and C. M. Sperberg-McQueen (2012):
1912"A TEI P5 Document Grammar for the IDS Text Model",
1913Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1914L<PDF|https://journals.openedition.org/jtei/pdf/508>
1915
1916TEI Consortium, eds:
1917"Feature Structures",
1918Guidelines for Electronic Text Encoding and Interchange.
1919L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1920
Akron941c1a62016-02-23 17:41:41 +01001921=head1 AVAILABILITY
1922
1923 https://github.com/KorAP/KorAP-XML-Krill
1924
1925
1926=head1 COPYRIGHT AND LICENSE
1927
Akronec01ff42025-10-17 11:59:33 +02001928Copyright (C) 2015-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001929
Akron6882d7d2021-02-08 09:43:57 +01001930Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001931
Akron29128262024-04-17 15:50:36 +02001932Contributors: Eliza Margaretha, Marc Kupietz
Akron941c1a62016-02-23 17:41:41 +01001933
Akron6882d7d2021-02-08 09:43:57 +01001934L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001935Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001936L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001937member of the
Akronf1849aa2019-12-16 23:35:33 +01001938L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001939
1940This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001941L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001942
1943=cut