#!/usr/bin/env perl
use strict;
use warnings;
use v5.32;

# Make the bundled library available before loading KorAP modules
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };

use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use POSIX qw/ceil/;
use Log::Any qw($log);
use Log::Any::Adapter;
use Pod::Usage;
use Cache::FastMmap;
use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
use KorAP::XML::Archive;
use KorAP::XML::TarBuilder;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Config::Simple;
use Path::Iterator::Rule;
use Parallel::ForkManager;
use File::Glob ':bsd_glob';
use File::Temp qw/tempdir tempfile/;
use File::Path qw(remove_tree make_path);
use File::Basename;
use Mojo::Collection 'c';
use String::Random qw(random_string);
use IO::File;
use Fcntl qw(:flock SEEK_END);

# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
# TODO: make output files

# TODO: Use KorAP::XML::ForkPool!

# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
#
# 2016/07/06
# - Added MDParser#Dependency
#
# 2016/10/15
# - Fixed temporary path issue in script
#
# 2016/10/24
# - Improved Windows support
#
# 2016/10/24
# - Added support for document extraction
#
# 2016/10/27
# - Added wildcard support for document extraction
#
# 2016/12/21
# - added support for base-sentences and base-tokenizations
#
# 2017/01/20
# - added support for DRuKoLa annotations
#
# 2017/02/08
# - added support for pagebreak annotations
#
# 2017/04/06
# - added support for wildcards in input
#
# 2017/04/07
# - support configuration option
# - support for temporary extraction
#
# 2017/04/12
# - support serial processing
# - support input root
# - introduced --sequential-extraction flag
#
# 2017/06/19
# - added support for DCK
#
# 2017/06/29
# - Fixed exit codes
#
# 2017/07/04
# - Fixed tar building process
#
# 2018/01/16
# - Added LWC support
#
# 2018/07/19
# - Preliminary support for HNC.
#
# 2019/01/22
# - Preliminary support for DGD.
# - Support for non-word tokens.
#
# 2019/02/13
# - Support for 'koral:field' array.
# - Support for Koral versioning.
# - Ignore temporary extract parameter on
#   directory archiving.
#
# 2019/08/08
# - Support for Talismane.
#
# 2019/12/17
# - Added support for DGD pseudo-sentences
#   based on anchor milestones.
# - Support for non-verbal annotations.
#
# 2020/04/23
# - Added support for Redewiedergabe-Korpus structure
#   annotations, based on sentence and paragraph milestones
# - Added support for Redewiedergabe-Korpus morphology
#
# 2021/10/11
# - Introduced support for Gingko
#
# 2022/01/17
# - Support for temporary extraction in config
# - Introduced support for Gingko
#
# 2022/07/21
# - Support for NKJP
#
# 2022/07/27
# - Support for preferred language transformation
#
# 2023/02/05
# - Support for UD
#
# 2023/02/13
# - Fix temporary-extract handling from configuration file.
#
# 2024/03/20
# - Added Spacy support.
#
# 2024/03/22
# - Improve core count logging.
# ----------------------------------------------------------

# Date of the last change, shown in the version message
our $LAST_CHANGE = '2025/07/15';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Koral version passed to the batch processor unless --koral is given
our $KORAL_VERSION = 0.03;

# Message printed along with the usage information
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION" .
  " - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command: a leading argument that does not start
# with a dash is taken as the command to execute
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /\A-/;

# Keep the arguments following the command untouched,
# as option parsing will consume @ARGV
my @keep_argv = @ARGV;

# Options that can be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Count jobs/cores if not set.
#
# Tries to detect the number of cores using the optional
# Sys::Info module and falls back to a single core, if the
# module is not available or the detection fails.
#
# Returns a list of two values:
#   - the number of jobs (five per core)
#   - a human readable message stating the jobs and cores
sub count_jobs {
  my $cores;

  # Sys::Info is an optional dependency
  if (eval("use Sys::Info; 1;") && eval("use Sys::Info::Constants qw( :device_cpu ); 1;")) {

    # The probe itself may fail or return nothing -
    # this should never terminate the script
    $cores = eval { Sys::Info->new->device('CPU')->count };
  };

  unless (defined $cores && $cores > 0) {
    $log->error('Unable to determine number of cores - set to 1');
    $cores = 1;
  };

  my $jobs = ceil(5 * $cores);
  return $jobs, "Run using $jobs jobs on $cores cores";
}

# Path to an optional configuration file
my $cfg_file;

# Parse options from the command line.
# Scalar options are written to %cfg, so they can be
# complemented by values from the configuration file.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'gzip|z'                   => \($cfg{gzip}),
  'temporary-extract|te=s'   => \($cfg{temporary_extract}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \($cfg{cache}),
  'config|cfg=s'             => \$cfg_file,
  'lang=s'                   => \($cfg{lang}),
  'log|l=s'                  => \($cfg{log}),
  'anno|a=s'                 => \@anno,

  # Still accepted, but without any effect
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'quiet'                    => \($cfg{quiet}),

  # Still accepted, but without any effect
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },
  'jobs|j=i'                 => \($cfg{jobs}),
  'koral|k=f'                => \($cfg{koral}),
  'to-tar'                   => \($cfg{to_tar}),
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),

  # Print the full usage information
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  },

  # Print the number of jobs that would be used with "--jobs -1"
  'job-count|jc' => sub {
    my (undef, $msg) = count_jobs();
    pod2usage(
      -verbose => 0,
      -msg     => $msg,
      -output  => '-'
    );
  }
);

# Parameters for pod2usage in case of invalid usage
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Load from configuration and fill non-given data.
# Values given on the command line are never overwritten.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  # import_from() returns a false value, if the file can't be
  # read or parsed - report this instead of silently continuing
  # with an empty configuration
  unless (Config::Simple->import_from($cfg_file, \%config)) {
    warn "Unable to read config from $cfg_file: " .
      (Config::Simple->error // 'unknown error') . "\n";
  };

  # Scalar options: keys in the configuration are hyphenated,
  # while the keys of %cfg are written with underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log lang cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init cache-delete
                      koral extract-dir jobs quiet!) {
    my $underlined = $key =~ tr/-/_/r;
    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # List options (skip, sigle, anno) are given as
  # semicolon separated values
  foreach my $list (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  ) {
    my ($key, $target) = @$list;
    if (!scalar(@$target) && defined $config{$key}) {
      @$target = split /\s*;\s*/, $config{$key};
    };
  };
}

# A configuration file was requested explicitly, but is missing
elsif ($cfg_file) {
  warn "Config file $cfg_file does not exist\n";
};

# Init variables and set default values

# Values without defaults
my ($output, $input_base, $gzip, $to_tar, $extract_dir) =
  @cfg{qw/output input_base gzip to_tar temporary_extract/};

# Values with defaults
my $token_base            = $cfg{token} // 'OpenNLP#tokens';
my $cache_file            = $cfg{cache} // 'korapxml2krill.cache';
my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Base annotations are compared in lower case
my ($base_sentences, $base_paragraphs, $base_pagebreaks) =
  map { lc($cfg{$_} // '') }
  qw/base_sentences base_paragraphs base_pagebreaks/;

# Quiet flag as a boolean
my $q = !!$cfg{quiet};

# Get tokenization basis.
# The variables are declared separately from the conditional
# assignment, as the behaviour of "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined,
# if the token base contains no '#' separator
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of the lower-cased annotations to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR - only errors by default
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first argument left after option parsing
  my $log_file = shift @ARGV;

  # No log file given or log file not available
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  KorAP::XML::Log::Slim->new($log_file)->slim_to;

  exit;
};


# When running a command without tar output,
# a given output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Start serial processing:
# Each input is processed by a separate call of this
# script using the 'archive' command.
if ($cmd && $cmd eq 'serial') {

  # Remove all input and output options together with their
  # values from the original arguments, as they are set
  # individually for each call below.
  # All spellings accepted by the option parser are recognized,
  # i.e. "-i FILE", "--input FILE" and "--input=FILE".
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Flag given separately from its value
    if (/\A--?(?:input|i|output|o)\z/) {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    }

    # Flag and value given as a single argument
    elsif (/\A--?(?:input|i|output|o)=/) {
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # Derive the output path from the name of the input
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv,
      '-i', $archive_input,
      '-o', $new_out
    );

    unless ($q) {
      print "Start serial processing of $archive_input to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving (the list form bypasses the shell).
    # A failing call is reported, but the remaining inputs
    # are still processed.
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $archive_input failed with status $?"
      );
    };
  };

  exit;
};

# Collect the base annotations that are taken
# from the DeReKo structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# Define supported (and preinstalled) transformation modules.
# Each entry consists of a foundry, a layer and optional parameters.
my @layers = (

  # Base - only if no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CorpusExplorer
  ['CorpusExplorer', 'Morpho'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - optionally as the source of base annotations
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure']),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence']
     : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure'
     ? ['RWK', 'Structure']
     : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Spacy
  ['Spacy', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # UDPipe
  ['UDPipe', 'Morpho'],
  ['UDPipe', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters:
# If everything is skipped ('#all'), only the annotations given
# explicitly (as 'Foundry#Layer') are used. Otherwise all supported
# layers are used, unless their foundry or their 'foundry#layer'
# combination is meant to be skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      my ($foundry, $layer) = map { lc } @{$_}[0, 1];
      !($skip{$foundry} || $skip{$foundry . '#' . $layer});
    } @layers);


# Meta data cache shared via a memory mapped file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file     => $cache_file,
  cache_size     => ($cfg{cache_size} // '50m'),
  init_file      => ($cfg{cache_init} // 1),
  unlink_on_exit => $cache_delete
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},

  # Tokenization basis
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,

  gzip              => $gzip,
  log               => $log,
  koral             => ($cfg{koral} // $KORAL_VERSION),

  # Annotations to process
  anno              => \@filtered_anno,
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
  lang              => $cfg{lang},
);

# Auto adjust jobs, if requested by "-1"
if ($jobs eq '-1') {
  my $msg;
  ($jobs, $msg) = count_jobs();
  print $msg . "\n" unless $q;
};

# Glob and prefix files
if (@input > 0) {
  @input =

    # Sort files by length
    sort { length($a) <=> length($b) }

    # Prefix with input root and expand wildcards
    map { bsd_glob($input_base ? catfile($input_base, $_) : $_) }
    @input;

  print 'Input is ' . join(', ', @input) . "\n" unless $q;
};

# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # Take the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # the path is expected to end with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  stop_time();

  exit;
};


Akrone10ad322016-02-27 10:54:26 +0100682# Extract XML files
Akron81500102017-04-07 20:45:44 +0200683if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100684
Akrond5643ad2017-07-04 20:27:13 +0200685 # Output is required
686 pod2usage(%ERROR_HASH) unless $output;
687
Akron7d4cdd82016-08-17 21:39:45 +0200688 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200689 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100690
Akron7d4cdd82016-08-17 21:39:45 +0200691 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100692 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200693 $log->error("Unzip is not installed or incompatible.");
694 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100695 };
696
Akronb0c88db2016-06-29 16:33:18 +0200697 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200698 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200699
Akron31a08cb2019-02-20 20:43:26 +0100700 # Will set @sigle
701 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200702
Akrone10ad322016-02-27 10:54:26 +0100703 # Iterate over all given sigles and extract
704 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100705
Akrona3518372024-01-22 23:29:00 +0100706 unless ($q) {
707 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200708
Akrona3518372024-01-22 23:29:00 +0100709 # TODO: Make this OS independent
710 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100711
Akrona3518372024-01-22 23:29:00 +0100712 # TODO:
713 # - prefix???
714 $archive->extract_sigle(0, [$_], $output, $jobs)
715 ? '' : 'not '
716 );
717 print "extracted.\n";
718 } else {
Akroncb12af72025-07-15 14:36:10 +0200719 $archive->extract_sigle($q, [$_], $output, $jobs);
Akrona3518372024-01-22 23:29:00 +0100720 }
Akrone10ad322016-02-27 10:54:26 +0100721 };
Akronb0c88db2016-06-29 16:33:18 +0200722 }
Akron7d4cdd82016-08-17 21:39:45 +0200723
724 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200725 else {
726 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200727 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100728 };
729}
730
Akron81500102017-04-07 20:45:44 +0200731
Akron941c1a62016-02-23 17:41:41 +0100732# Process an archive
733elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000734
Akron81500102017-04-07 20:45:44 +0200735 my $archive_output;
736
737 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100738 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200739
740 # Create new archive object
741 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
742
743 # Check zip capabilities
744 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200745 $log->error("Unzip is not installed or incompatible.");
746 exit 1;
Akron81500102017-04-07 20:45:44 +0200747 };
748
749 # Add further annotation archived
750 $archive->attach($_) foreach @input[1..$#input];
751
752 # Create a temporary directory
753 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200754 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100755 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200756 };
757
Akron63f20d42017-04-10 23:40:29 +0200758 # Add some random extra to avoid clashes with multiple archives
759 $extract_dir = catdir($extract_dir, random_string('cccccc'));
760
Akron31a08cb2019-02-20 20:43:26 +0100761 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100762 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
763 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200764 @input = ($extract_dir);
765 }
766 else {
767 $log->error('Unable to extract from primary archive ' . $input[0] .
768 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200769 exit 1;
Akron81500102017-04-07 20:45:44 +0200770 };
771 }
772
773 # Can't create archive object
774 else {
775 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200776 exit 1;
Akron81500102017-04-07 20:45:44 +0200777 };
778 };
779
# Fork pool for the conversion jobs.
# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);

my $count = 0; # Texts to process
my $iter  = 1; # Current text in process

my $tar_archive;
my $output_dir = $output;
my $tar_fh;
my $final_tar_file;
my %tar_pool;      # Pool index => { fh, file, archive }
my $next_tar = 1;  # Counter for tar assignment

# Initialize tar archive
if ($to_tar) {

  # Set output name and make sure it carries a .tar suffix
  $final_tar_file = $output;
  unless ($final_tar_file =~ /\.tar$/) {
    $final_tar_file .= '.tar';
  };

  print "Writing to file $final_tar_file\n" unless $q;

  # The availability of the optional builder does not change between
  # pool entries, so it is checked once and without a string eval
  my $has_tar_builder = eval { require Archive::Tar::Builder; 1 };

  # Create tar pool with size equal to number of jobs
  # If jobs is 0, create just one tar file
  my $pool_size = $jobs > 0 ? $jobs : 1;
  for my $i (1..$pool_size) {
    my ($fh, $temp_tar) = tempfile(
      "korapxml2krill_pool_${i}_XXXX",
      SUFFIX => '.tar',
      TMPDIR => 1
    );

    # Prefer the optional builder, fall back to the bundled one
    my $builder;
    if ($has_tar_builder) {
      $builder = Archive::Tar::Builder->new(ignore_errors => 1);
      $builder->set_handle($fh);
    }
    else {
      $builder = KorAP::XML::TarBuilder->new($fh);
    };

    $tar_pool{$i} = {
      fh      => $fh,
      file    => $temp_tar,
      archive => $builder,
    };
  };

  # Output to temporary directory
  $output_dir = File::Temp->newdir;
};
828
# Report on fork message.
# The callback receives the process id and the exit code of the
# finished job first; the last argument is the data structure the job
# handed to finish(): [ message, temporary directory, output file ].
$pool->run_on_finish (
  sub {
    my ($pid, $code) = @_;
    my $data = pop;

    # A job that ended abnormally delivers no data structure -
    # fall back to an empty one instead of dereferencing a non-reference
    $data = [] unless ref $data eq 'ARRAY';

    unless ($q) {
      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        $iter . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . ($data->[0] // '') . "\n";
    };
    $iter++;

    # Move the written file into one of the pool tar archives
    if (!$code && $to_tar && $data->[2]) {
      my $filename = $data->[2];

      # In scalar context fileparse() yields the base name only
      my $clean_file = fileparse($filename);

      # Get next available tar file in round-robin fashion
      my $pool_size = $jobs > 0 ? $jobs : 1;
      my $pool_idx = $next_tar;
      $next_tar = ($next_tar % $pool_size) + 1;

      my $tar = $tar_pool{$pool_idx};

      # Lock the tar file before writing
      flock($tar->{fh}, LOCK_EX);

      # Add file to pool tar
      $tar->{archive}->archive_as($filename => $clean_file);

      # Release lock
      flock($tar->{fh}, LOCK_UN);

      # The file is part of the archive now
      unlink $filename;
    };

    # Drop the reference to the temporary directory of the job
    $data->[1] = undef if $data->[1];
  }
);
869
my $t;    # Benchmark object taken when processing starts
my $temp; # Temporary extraction directory of the current text
print "Reading data ...\n" unless $q;

# unless (Cache::FastMmap->new(
#   share_file => $cache_file,
#   cache_size => $cache_size,
#   init_file => $cache_init
# )) {
#   print "Unable to intialize cache '$cache_file'\n\n";
#   exit(1);
# };


# Input is a directory
if (-d $input[0]) {

  # The directory tree is walked twice (counting and processing),
  # each time with a fresh iterator over all data.xml files
  my $data_file_iterator = sub {
    my $rule = Path::Iterator::Rule->new;
    $rule->name('data.xml')->file;
    return $rule->iter(
      $input[0] => {
        sorted => 0,
        depthfirst => -1,
        error_handler => undef
      }
    );
  };

  # First pass: count files
  my $count_walker = $data_file_iterator->();
  $count = 0;
  $count++ while defined $count_walker->();

  print "Start processing ...\n" unless $q;
  $t = Benchmark->new;

  # Second pass: process files using iterator
  my $walker = $data_file_iterator->();

  TEXT_DIRECTORY:
  while (defined(my $text_dir = $walker->())) {

    # Remove data.xml suffix to get directory path
    $text_dir =~ s/\/data\.xml$//;

    # Name of the file to write
    my $json_name = get_file_name($input[0], $text_dir) . '.json';
    $json_name .= '.gz' if $gzip;
    my $filename = catfile($output_dir, $json_name);

    # Get the next fork - the parent moves on to the next text
    next TEXT_DIRECTORY if $pool->start;

    my $return = $batch_file->process($text_dir => $filename);

    # Conversion failed
    if (!$return) {
      $pool->finish(1, ["Unable to process " . $text_dir]);
    }

    # Conversion succeeded (-1 marks an already existing file)
    else {
      my $already = $return == -1 ? " - already existing" : '';
      $pool->finish(
        0,
        [
          "Processed " . $filename . $already,
          undef,
          $filename
        ]
      );
    };
  };
}
942
# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

  # Nothing can be read from the archive without a usable unzip
  if (!$archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  for my $annotation_archive (@input[1..$#input]) {
    $archive->attach($annotation_archive);
  };

  # Get sigles to extract
  my $prefix = set_sigle($archive);

  print "Start processing ...\n" unless $q;
  $t = Benchmark->new;

  # Get count of texts
  $count = $archive->count_texts;

  # Get iterator for text paths
  my $next_text = $archive->list_texts_iterator;

  # Process texts one at a time using the iterator
  ARCHIVE_TEXT:
  while (defined(my $text_path = $next_text->())) {

    # Split path information - the leading prefix is not needed here
    my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

    # Name of the file to write
    my $json_name = catfile($corpus, $doc, $text) . '.json' . ($gzip ? '.gz' : '');
    my $filename = catfile($output_dir, get_file_name($input[0], $json_name));

    # Get the next fork - the parent moves on to the next text
    next ARCHIVE_TEXT if $pool->start;

    # Create temporary directory to extract to
    $temp = File::Temp->newdir;

    # TODO: Check if $filename exist at the beginning,
    #   because extraction can be horrible slow!

    # Extract from archive
    my $text_sigle = join('/', $corpus, $doc, $text);
    my $extracted = $archive->extract_sigle(
      $q, [$text_sigle], $temp, $sequential_extraction ? 1 : $jobs
    );

    # Unable to extract
    if (!$extracted) {
      $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
    }

    else {

      # Corpus directory inside the temporary directory
      my $corpus_dir = catdir("$temp", $corpus);

      # Directory of the extracted text
      my $dir = catdir($corpus_dir, $doc, $text);

      # Write file
      my $return = $batch_file->process($dir => $filename);

      # The temporary directory is passed along in both cases
      if ($return) {
        my $already = $return == -1 ? " - already existing" : '';
        $pool->finish(
          0,
          [
            "Processed " . $filename . $already,
            $temp,
            $filename
          ]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dir, $temp]);
      };
    };
  };
}

else {
  unless ($q) {
    print "Input is neither a directory nor an archive.\n\n";
  };
};
1028
1029 $pool->wait_all_children;
1030
# Merge all temporary tar files into final tar if needed
if ($to_tar && %tar_pool) {
  $| = 1;
  print "Merging " . scalar(keys %tar_pool) . " temporary tar files...\n" unless $q;

  # Open final tar file. binmode() is called without an argument:
  # an argument would be taken as a layer name, and "1" is none
  my $final_fh = IO::File->new($final_tar_file, 'w')
    or die "Cannot open $final_tar_file: $!";
  $final_fh->binmode;

  # Create final archive - prefer the optional builder
  my $final_archive;
  if (eval { require Archive::Tar::Builder; 1 }) {
    $final_archive = Archive::Tar::Builder->new(ignore_errors => 1);
    $final_archive->set_handle($final_fh);
  }
  else {
    $final_archive = KorAP::XML::TarBuilder->new($final_fh);
  };

  my $buffer_size = 1024 * 1024; # 1MB buffer

  # Finish and close all pool tar files (in numerical pool order)
  # NOTE(review): every pool archive is finished before it is copied;
  #   if finish() writes an end-of-archive marker, these markers end
  #   up inside the merged file - confirm that readers cope with that
  foreach my $pool_idx (sort { $a <=> $b } keys %tar_pool) {
    my $tar = $tar_pool{$pool_idx};
    $tar->{archive}->finish;
    $tar->{fh}->close;

    # Append temp tar content to final tar using buffered copy
    open my $temp_fh, '<:raw', $tar->{file}
      or die "Cannot open temp tar $tar->{file}: $!";

    my $buffer;
    while (1) {
      my $bytes_read = read($temp_fh, $buffer, $buffer_size);

      # A failed read must not silently truncate the archive
      die "Read error on temp tar $tar->{file}: $!"
        unless defined $bytes_read;

      # End of file
      last unless $bytes_read;

      # syswrite may write less than requested, so continue
      # at the offset until the whole chunk is written
      my $bytes_written = 0;
      while ($bytes_written < $bytes_read) {
        my $written = syswrite(
          $final_fh, $buffer, $bytes_read - $bytes_written, $bytes_written
        );
        die "Write error: $!" unless defined $written;
        $bytes_written += $written;
      };
    };
    close $temp_fh;

    # Clean up temp tar
    unlink $tar->{file};
  };

  # Close final tar
  $final_archive->finish;
  $final_fh->close;
  print "Wrote to tar archive $final_tar_file\n" unless $q;
}
1079
unless ($q) {

  # $t is only set when processing of a directory or an archive
  # was started - otherwise there is no duration to report
  print timestr(timediff(Benchmark->new, $t))."\n" if defined $t;
  print "Done.\n";
};
Akron81500102017-04-07 20:45:44 +02001084};
Akron941c1a62016-02-23 17:41:41 +01001085
Nils Diewald2db9ad02013-10-29 19:26:43 +00001086
# For an archive, this will create the list
# of all sigles to process.
#
# Without given sigles, the file-level @sigle list is filled with the
# sigles of all texts of the archive. Otherwise document sigles
# (Corpus/Document) are extracted to the output directory right away
# and only the remaining text sigles are kept in @sigle.
#
# Expects the archive object as the only parameter and returns the
# prefix information: the first value of split_path() of the last text
# listed, or the result of check_prefix(), and 1 if neither was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    foreach my $text_path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($text_path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check sigle for doc sigles
  my @text_sigles;
  my $prefix_checked = 0;

  # Iterate over all sigle
  foreach my $entry (@sigle) {

    # A doc sigle has exactly two path segments (after an optional ./)
    my $is_doc_sigle = $entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$entry ..." if $is_doc_sigle && !$q;

    # Check if a prefix is needed (only once).
    # The check is kept apart from the quiet flag, so the
    # returned prefix does not depend on the verbosity
    unless ($prefix_checked) {
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix && !$q;
      $prefix_checked = 1;
    };

    # Sigle is a text sigle - keep it for later processing
    unless ($is_doc_sigle) {
      push @text_sigles, $entry;
      next;
    };

    # Sigle is a doc sigle - extract the whole document
    print "\n" unless $q;

    my $extracted = $archive->extract_sigle(
      $q, [$entry], $output, $sequential_extraction ? 1 : $jobs
    );

    print '... ' . ($extracted ? '' : 'not ') . "extracted.\n" unless $q;
  };

  @sigle = @text_sigles;

  return $prefix;
};
1165
1166
# Cleanup temporary extraction directory.
# remove_tree() returns the number of removed objects,
# which is reported on the info log level.
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    sprintf('Removed directory %s with %s objects', $extract_dir, $removed)
  );
};


# Final newline on the standard output
print "\n";
1175
Nils Diewald2db9ad02013-10-29 19:26:43 +00001176__END__
Akron941c1a62016-02-23 17:41:41 +01001177
1178=pod
1179
1180=encoding utf8
1181
1182=head1 NAME
1183
Akron42f48c12020-02-14 13:08:13 +01001184korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001185
1186
1187=head1 SYNOPSIS
1188
Akron9cb8c982024-03-22 10:46:56 +01001189 $ korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001190
Akron2fd402b2016-10-27 21:26:48 +02001191
Akron941c1a62016-02-23 17:41:41 +01001192=head1 DESCRIPTION
1193
1194L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1195compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001196The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001197
1198
1199=head1 INSTALLATION
1200
1201The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1202
Akron9cb8c982024-03-22 10:46:56 +01001203 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001204
Akronc13a1702016-03-15 19:33:14 +01001205In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001206be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001207Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001208Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1209Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001211
1212=head1 ARGUMENTS
1213
Akron9cb8c982024-03-22 10:46:56 +01001214 $ korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001215
1216Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001217It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001218
Akron941c1a62016-02-23 17:41:41 +01001219=over 2
1220
1221=item B<archive>
1222
Akron9cb8c982024-03-22 10:46:56 +01001223 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001224
Akron2fd402b2016-10-27 21:26:48 +02001225Converts an archive of KorAP-XML documents. It expects a directory
1226(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001227
1228=item B<extract>
1229
Akron9cb8c982024-03-22 10:46:56 +01001230 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001231
1232Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001233
Akron63f20d42017-04-10 23:40:29 +02001234=item B<serial>
1235
Akron9cb8c982024-03-22 10:46:56 +01001236 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001237
Akronce033502024-09-11 10:51:49 +02001238Convert archives in serial. The inputs are not merged but treated
Akron63f20d42017-04-10 23:40:29 +02001239as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001241are created based on the archive name. In case the C<--to-tar> flag is given,
1242the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001243
1244
Akron84b53ad2022-01-14 12:39:15 +01001245=item B<slimlog>
1246
Akron9cb8c982024-03-22 10:46:56 +01001247 $ korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001248
Filters out all useless (i.e. successful) information from logs, to simplify
1250log checks. Expects no further options.
1251
1252
Akron941c1a62016-02-23 17:41:41 +01001253=back
1254
1255
1256=head1 OPTIONS
1257
1258=over 2
1259
Akrona76d8352016-10-27 16:27:32 +02001260=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001261
Akrona76d8352016-10-27 16:27:32 +02001262Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001263
Akron7606afa2016-10-25 16:23:49 +02001264Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001265document, while C<archive> expects a KorAP-XML corpus folder or a zip
1266file to batch process multiple files.
1267C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001268
Akrondee3cf62024-06-14 18:14:48 +02001269C<archive> supports multiple input zip files with the constraint
Akron2cfe8092016-06-24 17:48:49 +02001270that the first archive listed contains all primary data files
1271and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001272
Akron7606afa2016-10-25 16:23:49 +02001273 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001274
Akron821db3d2017-04-06 21:19:31 +02001275Input may also be defined using BSD glob wildcards.
1276
1277 -i 'file/news*.zip'
1278
1279The extended input array will be sorted in length order, so the shortest
1280path needs to contain all primary data files and all meta data files.
1281
Akrondee3cf62024-06-14 18:14:48 +02001282(The directory structure follows the base directory format
Akron0c3e3752016-06-28 15:55:53 +02001283that may include a C<.> root folder.
1284In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001285need to be passed with a hash sign in front of the archive's name.
1286This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001287
Akron7606afa2016-10-25 16:23:49 +02001288To support zip files, a version of C<unzip> needs to be installed that is
1289compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001290
Akron7606afa2016-10-25 16:23:49 +02001291B<The root folder switch using the hash sign is experimental and
1292may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001293
Akronf73ffb62018-06-27 12:13:59 +02001294
Akron63f20d42017-04-10 23:40:29 +02001295=item B<--input-base|-ib> <directory>
1296
1297The base directory for inputs.
1298
1299
Akron941c1a62016-02-23 17:41:41 +01001300=item B<--output|-o> <directory|file>
1301
1302Output folder for archive processing or
1303document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001304writes to C<STDOUT> by default
1305(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001306
1307=item B<--overwrite|-w>
1308
1309Overwrite files that already exist.
1310
Akronf73ffb62018-06-27 12:13:59 +02001311
Akron3741f8b2016-12-21 19:55:21 +01001312=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001313
1314Define the default tokenization by specifying
1315the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001316of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001317This will directly take the file instead of running
1318the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001319
Akron3741f8b2016-12-21 19:55:21 +01001320
1321=item B<--base-sentences|-bs> <foundry>#<layer>
1322
1323Define the layer for base sentences.
1324If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001325Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1326layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001327
1328 Defaults to unset.
1329
1330
1331=item B<--base-paragraphs|-bp> <foundry>#<layer>
1332
1333Define the layer for base paragraphs.
1334If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001335Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001337
1338 Defaults to unset.
1339
1340
Akron41ac10b2017-02-08 22:47:25 +01001341=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1342
1343Define the layer for base pagebreaks.
1344Currently C<DeReKo#Structure> is the only layer supported.
1345
1346 Defaults to unset.
1347
1348
Akron941c1a62016-02-23 17:41:41 +01001349=item B<--skip|-s> <foundry>[#<layer>]
1350
Akronf7ad89e2016-03-16 18:22:47 +01001351Skip specific annotations by specifying the foundry
1352(and optionally the layer with a C<#>-prefix),
1353e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001354Can be set multiple times.
1355
Akronf73ffb62018-06-27 12:13:59 +02001356
Akronc13a1702016-03-15 19:33:14 +01001357=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001358
Akronf7ad89e2016-03-16 18:22:47 +01001359Convert specific annotations by specifying the foundry
1360(and optionally the layer with a C<#>-prefix),
1361e.g. C<Mate> or C<Mate#Morpho>.
1362Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001363
Akronf73ffb62018-06-27 12:13:59 +02001364
Akroned9baf02019-01-22 17:03:25 +01001365=item B<--non-word-tokens|-nwt>
1366
1367Tokenize non-word tokens like word tokens (defined as matching
1368C</[\d\w]/>). Useful to treat punctuations as tokens.
1369
1370 Defaults to unset.
1371
Akronf1849aa2019-12-16 23:35:33 +01001372
1373=item B<--non-verbal-tokens|-nvt>
1374
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' (U+25AE) aka C<\x{25ae}>.
1377
1378 Defaults to unset.
1379
1380
Akron941c1a62016-02-23 17:41:41 +01001381=item B<--jobs|-j>
1382
Akron29128262024-04-17 15:50:36 +02001383Define the number of spawned forks for concurrent jobs
1384of archive processing.
Akron11c80302016-03-18 19:44:43 +01001385Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001386
Akrona472a242023-02-13 13:46:30 +01001387If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001388also apply to extraction.
1389
Akronebbac2e2024-03-22 10:31:23 +01001390Pass C<-1>, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001391times the number of available cores, in case L<Sys::Info>
Akronebbac2e2024-03-22 10:31:23 +01001392is available and can read CPU count (see C<--job-count>).
1393Be aware, that the report of available cores
Akron29128262024-04-17 15:50:36 +02001394may not work in certain conditions. Benchmarking the processing
1395speed based on the number of jobs may be valuable.
Akronebbac2e2024-03-22 10:31:23 +01001396
Akronf7ad89e2016-03-16 18:22:47 +01001397This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001398
Akronf73ffb62018-06-27 12:13:59 +02001399
Akronebbac2e2024-03-22 10:31:23 +01001400=item B<--job-count|-jc>
1401
1402Print job and core information that would be used if
1403C<-1> was passed to C<--jobs>.
1404
1405
Akron263274c2019-02-07 09:48:30 +01001406=item B<--koral|-k>
1407
1408Version of the output format. Supported versions are:
1409C<0> for legacy serialization, C<0.03> for serialization
1410with metadata fields as key-values on the root object,
1411C<0.4> for serialization with metadata fields as a list
1412of C<"@type":"koral:field"> objects.
1413
1414Currently defaults to C<0.03>.
1415
1416
Akron9ec88872017-04-12 16:29:06 +02001417=item B<--sequential-extraction|-se>
1418
Flag to indicate that archives are extracted sequentially, i.e. that
the C<jobs> value does not apply to extraction.
1420Some systems may have problems with extracting multiple archives
1421to the same folder at the same time.
1422Can be flagged using C<--no-sequential-extraction> as well.
1423Defaults to C<false>.
1424
Akronf73ffb62018-06-27 12:13:59 +02001425
Akron35db6e32016-03-17 22:42:22 +01001426=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001427
Akron35db6e32016-03-17 22:42:22 +01001428Define the metadata parser to use. Defaults to C<I5>.
1429Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1430This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001431
Akronf73ffb62018-06-27 12:13:59 +02001432
Akron941c1a62016-02-23 17:41:41 +01001433=item B<--gzip|-z>
1434
Akronf7ad89e2016-03-16 18:22:47 +01001435Compress the output.
1436Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001437
Akronf73ffb62018-06-27 12:13:59 +02001438
Akron11c80302016-03-18 19:44:43 +01001439=item B<--cache|-c>
1440
1441File to mmap a cache (using L<Cache::FastMmap>).
1442Defaults to C<korapxml2krill.cache> in the calling directory.
1443
Akronf73ffb62018-06-27 12:13:59 +02001444
Akron11c80302016-03-18 19:44:43 +01001445=item B<--cache-size|-cs>
1446
1447Size of the cache. Defaults to C<50m>.
1448
Akronf73ffb62018-06-27 12:13:59 +02001449
Akron11c80302016-03-18 19:44:43 +01001450=item B<--cache-init|-ci>
1451
1452Initialize cache file.
1453Can be flagged using C<--no-cache-init> as well.
1454Defaults to C<true>.
1455
Akronf73ffb62018-06-27 12:13:59 +02001456
Akron11c80302016-03-18 19:44:43 +01001457=item B<--cache-delete|-cd>
1458
1459Delete cache file after processing.
1460Can be flagged using C<--no-cache-delete> as well.
1461Defaults to C<true>.
1462
Akronf73ffb62018-06-27 12:13:59 +02001463
Akron636aa112017-04-07 18:48:56 +02001464=item B<--config|-cfg>
1465
1466Configure the parameters of your call in a file
1467of key-value pairs with whitespace separator
1468
1469 overwrite 1
1470 token DeReKo#Structure
1471 ...
1472
1473Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001474C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akronc0ac4ff2024-04-15 18:03:15 +02001475C<token>, C<log>,
1476C<cache>, C<cache-size>, C<cache-init>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001477C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001478C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001479C<base-sentences>, C<base-paragraphs>,
1480C<base-pagebreaks>,
1481C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001482(semicolon separated), C<anno> (semicolon separated).
1483
Akronf73ffb62018-06-27 12:13:59 +02001484Configuration parameters will always be overwritten by
1485passed parameters.
1486
1487
Akron81500102017-04-07 20:45:44 +02001488=item B<--temporary-extract|-te>
1489
Akrona472a242023-02-13 13:46:30 +01001490Only valid for the C<archive> and C<serial>
1491commands.
Akron81500102017-04-07 20:45:44 +02001492
1493This will first extract all files into a
1494directory and then will archive.
1495If the directory is given as C<:temp:>,
1496a temporary directory is used.
1497This is especially useful to avoid
1498massive unzipping and potential
1499network latency.
Akron636aa112017-04-07 18:48:56 +02001500
Akronf73ffb62018-06-27 12:13:59 +02001501
Akronc93a0802019-07-11 15:48:34 +02001502=item B<--to-tar>
1503
1504Only valid for the C<archive> command.
1505
1506Writes the output into a tar archive.
1507
1508
Akrone10ad322016-02-27 10:54:26 +01001509=item B<--sigle|-sg>
1510
Akron20807582016-10-26 17:11:34 +02001511Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001512Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001513I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001514Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001515In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001516On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001517
Akron64f7fae2022-07-27 12:45:33 +02001518=item B<--lang>
1519
1520Preferred language for metadata fields. In case multiple titles are
1521given (on any level) with different C<xml:lang> attributes,
1522the language given is preferred.
1523Because titles may have different sources and different priorities,
1524non-specific language titles may still be preferred in case the title
1525source has a higher priority.
1526
Akronf73ffb62018-06-27 12:13:59 +02001527
Akron941c1a62016-02-23 17:41:41 +01001528=item B<--log|-l>
1529
Akronb9c33812020-10-21 16:19:35 +02001530The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001531
Akronf73ffb62018-06-27 12:13:59 +02001532
Akrona3518372024-01-22 23:29:00 +01001533=item B<--quiet>
1534
1535Silence all information (non-log) outputs.
1536
1537
Akron941c1a62016-02-23 17:41:41 +01001538=item B<--help|-h>
1539
Akron42f48c12020-02-14 13:08:13 +01001540Print help information.
Akron941c1a62016-02-23 17:41:41 +01001541
Akronf73ffb62018-06-27 12:13:59 +02001542
Akron941c1a62016-02-23 17:41:41 +01001543=item B<--version|-v>
1544
1545Print version information.
1546
1547=back
1548
Akron311e29b2024-09-11 11:46:09 +02001549=head1 PERFORMANCE
1550
There are some ways to improve performance for large tasks:

=over 2

=item First unpack

Using the archive or serial command on one or multiple zip files
can be very slow, as it needs to unpack small portions every time.
It's better to use C<--temporary-extract> to unpack the whole archive
first into a temporary directory and then read the extracted files.
This is especially important for remote archives.

=item Limit annotations

Per default, all supported annotation layers are sought. This can be limited
by adding C<--skip '#ALL'> and only listing the expected annotations with C<--anno>.

=item Checking the parallel job count

By providing the number of parallel jobs using C<--jobs>, the execution can be tailored to specific
hardware environments.

=item Install ripunzip

For full extraction of data, L<ripunzip|https://github.com/google/ripunzip> can be
used for improved performance.

=back

1576
Akronc13a1702016-03-15 19:33:14 +01001577=head1 ANNOTATION SUPPORT
1578
1579L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1580developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1581The base foundry with paragraphs, sentences, and the text element are mandatory for
1582L<Krill|https://github.com/KorAP/Krill>.
1583
Akron821db3d2017-04-06 21:19:31 +02001584 Base
1585 #Paragraphs
1586 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001587
Akron821db3d2017-04-06 21:19:31 +02001588 Connexor
1589 #Morpho
1590 #Phrase
1591 #Sentences
1592 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001593
Akron821db3d2017-04-06 21:19:31 +02001594 CoreNLP
1595 #Constituency
1596 #Morpho
1597 #NamedEntities
1598 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001599
Akron5530a552022-02-17 17:53:15 +01001600 CorpusExplorer
1601 #Morpho
1602
Akronce125b62017-06-19 11:54:36 +02001603 CMC
1604 #Morpho
1605
Akron821db3d2017-04-06 21:19:31 +02001606 DeReKo
1607 #Structure
Akronc13a1702016-03-15 19:33:14 +01001608
Akron57510c12019-01-04 14:58:53 +01001609 DGD
1610 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001611 #Structure
Akron57510c12019-01-04 14:58:53 +01001612
Akron821db3d2017-04-06 21:19:31 +02001613 DRuKoLa
1614 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001615
Akron821db3d2017-04-06 21:19:31 +02001616 Glemm
1617 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001618
Akronabb36902021-10-11 15:51:06 +02001619 Gingko
1620 #Morpho
1621
Akronea1aed52018-07-19 14:43:34 +02001622 HNC
1623 #Morpho
1624
Akron4c679192018-01-16 17:41:49 +01001625 LWC
1626 #Dependency
1627
Akron821db3d2017-04-06 21:19:31 +02001628 Malt
1629 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001630
Akron821db3d2017-04-06 21:19:31 +02001631 MarMoT
1632 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001633
Akron821db3d2017-04-06 21:19:31 +02001634 Mate
1635 #Dependency
1636 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001637
Akron821db3d2017-04-06 21:19:31 +02001638 MDParser
1639 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001640
Akrone85a7762022-07-22 08:05:03 +02001641 NKJP
1642 #Morpho
1643 #NamedEntities
1644
Akron821db3d2017-04-06 21:19:31 +02001645 OpenNLP
1646 #Morpho
1647 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001648
Akron07e24772020-04-23 14:00:54 +02001649 RWK
1650 #Morpho
1651 #Structure
1652
Akron821db3d2017-04-06 21:19:31 +02001653 Sgbr
1654 #Lemma
1655 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001656
Marc Kupietzb8c53822024-03-16 18:54:08 +01001657 Spacy
1658 #Morpho
1659
Akron7d5e6382019-08-08 16:36:27 +02001660 Talismane
1661 #Dependency
1662 #Morpho
1663
Akron821db3d2017-04-06 21:19:31 +02001664 TreeTagger
1665 #Morpho
1666 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001667
Akron83aedd32023-02-07 10:57:41 +01001668 UDPipe
1669 #Dependency
1670 #Morpho
1671
Akron821db3d2017-04-06 21:19:31 +02001672 XIP
1673 #Constituency
1674 #Morpho
1675 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001676
Akronc13a1702016-03-15 19:33:14 +01001677
1678More importers are in preparation.
1679New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1680See the built-in annotation importers as examples.
1681
Akronf73ffb62018-06-27 12:13:59 +02001682
Akron41e6c8b2021-10-14 20:22:18 +02001683=head1 METADATA SUPPORT
1684
1685L<KorAP::XML::Krill> has built-in importers for some meta data variants
Akron4b001ce2024-06-06 12:32:11 +02001686that are part of the KorAP preprocessing pipeline.
Akron41e6c8b2021-10-14 20:22:18 +02001687
1688=over 2
1689
Akron1d101492024-06-06 12:47:35 +02001690=item B<I5>
Akron41e6c8b2021-10-14 20:22:18 +02001691
Akron1d101492024-06-06 12:47:35 +02001692Meta data for all I5 files
Akron41e6c8b2021-10-14 20:22:18 +02001693
Akron1d101492024-06-06 12:47:35 +02001694=item B<Sgbr>
Akron41e6c8b2021-10-14 20:22:18 +02001695
Akron1d101492024-06-06 12:47:35 +02001696Meta data from the Schreibgebrauch project
Akron2532f1b2023-05-15 13:41:24 +02001697
Akron1d101492024-06-06 12:47:35 +02001698=item B<Gingko>
1699
1700Meta data from the Gingko project in addition to I5
1701
1702=item B<ICC>
1703
1704Meta data for the ICC in addition to I5
1705
1706=item B<NKJP>
1707
1708Meta data for the NKJP corpora
Akron24ad3c02024-06-03 12:38:20 +02001709
Akron41e6c8b2021-10-14 20:22:18 +02001710=back
1711
Akron41e6c8b2021-10-14 20:22:18 +02001712New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1713See the built-in meta data importers as examples.
1714
Akron4b001ce2024-06-06 12:32:11 +02001715The I5 metadata definition is based on TEI-P5 and supports C<E<lt>xenoDataE<gt>>
Akron82064bb2024-06-17 12:53:23 +02001716with C<E<lt>metaE<gt>> elements like
Akron4b001ce2024-06-06 12:32:11 +02001717
1718 <meta type="..." name="..." project="..." desc="...">...</meta>
1719
1720that are directly translated to Krill objects. The supported values are:
1721
1722=over 2
1723
Akron1d101492024-06-06 12:47:35 +02001724=item C<type>
Akron4b001ce2024-06-06 12:32:11 +02001725
1726=over 4
1727
Akron1d101492024-06-06 12:47:35 +02001728=item C<string>
Akron4b001ce2024-06-06 12:32:11 +02001729
Akron1d101492024-06-06 12:47:35 +02001730String meta data value
Akron4b001ce2024-06-06 12:32:11 +02001731
Akron1d101492024-06-06 12:47:35 +02001732=item C<keyword>
Akron4b001ce2024-06-06 12:32:11 +02001733
Akrondee3cf62024-06-14 18:14:48 +02001734String meta data value that can be given multiple times
Akron4b001ce2024-06-06 12:32:11 +02001735
Akron1d101492024-06-06 12:47:35 +02001736=item C<text>
Akron4b001ce2024-06-06 12:32:11 +02001737
Akrondee3cf62024-06-14 18:14:48 +02001738String meta data value that is tokenized and can be searched as token sequences
Akron4b001ce2024-06-06 12:32:11 +02001739
Akron1d101492024-06-06 12:47:35 +02001740=item C<date>
1741
1742Date meta data value (as "yyyy/mm/dd" with optional granularity)
1743
1744=item C<integer>
1745
1746Numerical meta data value
1747
Akrondee3cf62024-06-14 18:14:48 +02001748=item C<attachment>
Akron1d101492024-06-06 12:47:35 +02001749
1750Non-indexed meta data value (only retrievable)
1751
1752=item C<uri>
1753
1754Non-indexed attached URI, takes the desc as the title for links
Akron4b001ce2024-06-06 12:32:11 +02001755
1756=back
1757
Akron1d101492024-06-06 12:47:35 +02001758=item C<name>
Akron4b001ce2024-06-06 12:32:11 +02001759
Akrondee3cf62024-06-14 18:14:48 +02001760The key of the meta object that may be prefixed by C<corpus> or C<doc>, in case the
Akron693f5882024-06-06 12:52:39 +02001761C<E<lt>xenoDataE<gt>> information is located on these levels. The text level introduces
1762no prefixes.
Akron4b001ce2024-06-06 12:32:11 +02001763
Akron1d101492024-06-06 12:47:35 +02001764=item C<project> (optional)
Akron4b001ce2024-06-06 12:32:11 +02001765
Akron1d101492024-06-06 12:47:35 +02001766A prefixed namespace of the key
1767
1768=item C<desc> (optional)
1769
1770A description of the key
1771
1772=item text content
1773
1774The value of the meta object
Akron4b001ce2024-06-06 12:32:11 +02001775
1776=back
1777
Akron41e6c8b2021-10-14 20:22:18 +02001778
Akron8f69d632020-01-15 16:58:11 +01001779=head1 About KorAP-XML
1780
1781KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1782data model (Bański et al. 2013), where text data are stored physically
1783separated from their interpretations (i.e. annotations).
1784A text document in KorAP-XML therefore consists of several files
1785containing primary data, metadata and annotations.
1786
1787The structure of a single KorAP-XML document can be as follows:
1788
1789 - data.xml
1790 - header.xml
1791 + base
1792 - tokens.xml
1793 - ...
1794 + struct
1795 - structure.xml
1796 - ...
1797 + corenlp
1798 - morpho.xml
1799 - constituency.xml
1800 - ...
1801 + tree_tagger
1802 - morpho.xml
1803 - ...
1804 - ...
1805
1806The C<data.xml> contains the primary data, the C<header.xml> contains
1807the metadata, and the annotation layers are stored in subfolders
1808like C<base>, C<struct> or C<corenlp>
1809(so-called "foundries"; Bański et al. 2013).
1810
1811Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001812(Lüngen and Sperberg-McQueen 2012). See the documentation in
1813L<KorAP::XML::Meta::I5> for translatable fields.
1814
1815Annotations correspond to a variant of the TEI-P5 feature structures
1816(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001817Annotation feature structures refer to character sequences of the primary text
1818inside the C<text> element of the C<data.xml>.
1819A single annotation containing the lemma of a token can have the following structure:
1820
1821 <span from="0" to="3">
1822 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1823 <f name="lex">
1824 <fs>
1825 <f name="lemma">zum</f>
1826 </fs>
1827 </f>
1828 </fs>
1829 </span>
1830
1831The C<from> and C<to> attributes refer to the character span
1832in the primary text.
1833Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1834the structure may vary. See L<KorAP::XML::Annotation::*> for various
1835annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001836
1837Multiple KorAP-XML documents are organized on three levels following
1838the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1839corpus E<gt> document E<gt> text. On each level metadata information
1840can be stored, which C<korapxml2krill> will merge into a single metadata
1841object per text. A corpus is therefore structured as follows:
1842
1843 + <corpus>
1844 - header.xml
1845 + <document>
1846 - header.xml
1847 + <text>
1848 - data.xml
1849 - header.xml
1850 - ...
1851 - ...
1852
1853A single text can be identified by the concatenation of
1854the corpus identifier, the document identifier and the text identifier.
1855This identifier is called the text sigle
1856(e.g. a text with the identifier C<18486> in the document C<060> in the
1857corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1858
1859These corpora are often stored in zip files, which C<korapxml2krill>
1860can deal with. Corpora may also be split into multiple zip archives
1861(e.g. one zip file per foundry), which is also supported (see C<--input>).
1862
1863Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1864in form of a test suite.
1865The resulting JSON format merges all annotation layers
1866based on a single token stream.
1867
1868=head2 References
1869
1870Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1871KorAP data model: first approximation, December.
1872
1873Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1874"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1875Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1876L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1877
1878Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1879"Robust corpus architecture: a new look at virtual collections and data access",
1880Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1881L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1882
1883Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1884Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1885"Towards an international standard on feature structure representation",
1886Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1887pp. 373-376.
1888L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1889
1890Harald Lüngen and C. M. Sperberg-McQueen (2012):
1891"A TEI P5 Document Grammar for the IDS Text Model",
1892Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1893L<PDF|https://journals.openedition.org/jtei/pdf/508>
1894
1895TEI Consortium, eds:
1896"Feature Structures",
1897Guidelines for Electronic Text Encoding and Interchange.
1898L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1899
Akron941c1a62016-02-23 17:41:41 +01001900=head1 AVAILABILITY
1901
1902 https://github.com/KorAP/KorAP-XML-Krill
1903
1904
1905=head1 COPYRIGHT AND LICENSE
1906
Akrona3518372024-01-22 23:29:00 +01001907Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001908
Akron6882d7d2021-02-08 09:43:57 +01001909Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001910
Akron29128262024-04-17 15:50:36 +02001911Contributors: Eliza Margaretha, Marc Kupietz
Akron941c1a62016-02-23 17:41:41 +01001912
Akron6882d7d2021-02-08 09:43:57 +01001913L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001914Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001915L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001916member of the
Akronf1849aa2019-12-16 23:35:33 +01001917L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001918
1919This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001920L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001921
1922=cut