blob: 487368f71a9cf8e27bdaa023b11cf1a413120ab1 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron7d4d2d72024-09-05 11:05:35 +02004use v5.32;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010018use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron0a0d1f92024-11-14 14:31:42 +010022use Path::Iterator::Rule;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akroncb12af72025-07-15 14:36:10 +020025use File::Temp qw/tempdir tempfile/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Marc Kupietzb8c53822024-03-16 18:54:08 +0100174# 2024/03/20
175# - Added Spacy support.
176#
Akronebbac2e2024-03-22 10:31:23 +0100177# 2024/03/22
178# - Improve core count logging.
Akron941c1a62016-02-23 17:41:41 +0100179# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100180
# Release information of the script
our $LAST_CHANGE   = '2025/07/15';

# Directory the script is located in
our $LOCAL         = $FindBin::Bin;

# Default Koral version used for serialization
our $KORAL_VERSION = 0.03;

# One-line version banner, shown by the help and version output
our $VERSION_MSG   = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
187
# Parse command: The first argument is treated as the command,
# unless it is missing or starts with a dash (i.e. is an option)
our @ARGV;
my $cmd = ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-')
  ? shift(@ARGV)
  : undef;

# Remember all arguments following the command
my @keep_argv = @ARGV;

# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100200
# Count jobs/cores if not set.
#
# The number of cores is requested from Sys::Info, which is loaded at
# runtime so the script also works in case the module is not available.
# If the number of cores can't be determined (module missing, query
# failing, or no positive count reported), an error is logged and a
# single core is assumed.
#
# Returns a list of two values:
#   - the number of jobs (five times the number of cores)
#   - a message reporting on jobs and cores
sub count_jobs {
  my $msg = 'Unable to determine number of cores - set to 1';
  my $cores;

  if (eval("use Sys::Info; 1;") && eval("use Sys::Info::Constants qw( :device_cpu ); 1;")) {

    # The query may fail or return nothing -
    # treat both like an unknown number of cores
    $cores = eval { Sys::Info->new->device('CPU')->count };
  };

  unless (defined $cores && $cores > 0) {
    $log->error($msg);
    $cores = 1;
  };

  my $jobs = ceil(5 * $cores);
  return $jobs, "Run using $jobs jobs on $cores cores";
}
220
# Parse options from the command line.
# Single value options are stored in %cfg (with dashes in the option
# name replaced by underscores), repeatable options in arrays.
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},

  # Base annotations
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},

  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{temporary_extract},

  # Filters
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,

  'cache|c=s'                => \$cfg{cache},
  'config|cfg=s'             => \(my $cfg_file),
  'lang=s'                   => \$cfg{lang},
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Accepted for backwards compatibility only
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },

  'quiet'                    => \$cfg{quiet},

  # Accepted for backwards compatibility only
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },

  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},

  # Caching
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Show the usage sections of the documentation
  'help|h'                   => sub {
    pod2usage(
      '-sections' => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      '-verbose'  => 99,
      '-msg'      => $VERSION_MSG,
      '-output'   => '-'
    );
  },

  # Show the version banner
  'version|v'                => sub {
    pod2usage(
      '-verbose' => 0,
      '-msg'     => $VERSION_MSG,
      '-output'  => '-'
    );
  },

  # Show the number of jobs that would be chosen automatically
  'job-count|jc'             => sub {
    my (undef, $report) = count_jobs();
    pod2usage(
      '-verbose' => 0,
      '-msg'     => $report,
      '-output'  => '-'
    );
  }
);

# Parameters for usage output with exit code 1,
# used whenever required arguments are missing
my %ERROR_HASH = (
  '-sections' => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  '-verbose'  => 99,
  '-msg'      => $VERSION_MSG,
  '-output'   => '-',
  '-exit'     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200289
# Load from configuration and fill non-given data.
# Values passed on the command line always take precedence.
if ($cfg_file && -e $cfg_file) {
  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single value options - stored in %cfg with underscores
  # instead of dashes in the name
  my @single_value_keys = qw!output cache-size input-base token overwrite
                             meta base-sentences base-paragraphs base-pagebreaks
                             gzip to-tar log lang cache non-word-tokens
                             non-verbal-tokens sequential-extraction
                             temporary-extract cache-init cache-delete
                             koral extract-dir jobs quiet!;

  for my $key (@single_value_keys) {
    next unless defined $config{$key};

    (my $name = $key) =~ tr/-/_/;
    $cfg{$name} //= $config{$key};
  };

  # Multi value options (skip, sigle, anno) - given in the
  # configuration as a single string separated by semicolons
  my %list_for = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno
  );

  for my $key (qw!skip sigle anno!) {
    my $list = $list_for{$key};

    # Already set on the command line or not configured
    next if scalar(@$list) || !defined $config{$key};

    @$list = split /\s*;\s*/, $config{$key};
  };
};
325
# Init variables and set default values.
# All values are taken from the configuration hash %cfg.
my $output                = $cfg{output};
my $input_base            = $cfg{input_base};
my $gzip                  = $cfg{gzip};
my $to_tar                = $cfg{to_tar};
my $extract_dir           = $cfg{temporary_extract};
my $token_base            = $cfg{token} // 'OpenNLP#tokens';
my $cache_file            = $cfg{cache} // 'korapxml2krill.cache';
my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Quiet flag, normalized to 1 or 0
my $q = $cfg{quiet} ? 1 : 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# The variables are declared separately from the conditional
# assignment, as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is missing
# in case the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct,
# e.g. 'CORPUS_DOC.TEXT' becomes 'CORPUS/DOC/TEXT'
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of all foundries and layers to skip (lowercased)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200353
# Log to STDERR - the log level defaults to ERROR
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming.
# The log file is expected as the first argument following the options.
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  my $log_file = shift @ARGV;

  # The log file may be missing on the command line or on disk
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  KorAP::XML::Log::Slim->new($log_file)->slim_to;

  exit;
};
379
380
# For all commands not writing to a tar file,
# a given output has to be an existing directory
if ($cmd && $output && !defined($to_tar)) {
  unless (-e $output && -d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000391
# Start serial processing.
# Each input is converted by a separate 'archive' run of this script,
# one after the other. All options are passed on, except for input and
# output, which are set individually for each run.
if ($cmd && $cmd eq 'serial') {

  # Flags followed by a separate value
  my %is_io_flag = map { $_ => 1 } qw/-i --input -o --output/;

  # Remove all inputs and outputs
  my $remove_next = 0;
  my @passed_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag, e.g. '-i archive.zip'
    if ($is_io_flag{$arg}) {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Input or output flag with attached value,
    # e.g. '--input=archive.zip' - otherwise the original
    # input would be passed to every single run as well
    elsif ($arg =~ /\A--?(?:input|output|i|o)=/) {
      next;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    unless ($q) {
      print "Start serial processing of $in to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving - a failing run is reported,
    # but the remaining inputs are still processed
    if (system(@archive_cmd) != 0) {
      my $reason = $? == -1   ? "unable to execute ($!)"
                 : ($? & 127) ? 'terminated by signal ' . ($? & 127)
                 :              'exit status ' . ($? >> 8);
      $log->error("Archiving of $in failed: $reason");
    };
  };

  # The exit code is deliberately not changed by failing runs
  exit;
};
444
# DeReKo structure attributes serving as base annotations,
# depending on the base-sentences, base-paragraphs and
# base-pagebreaks settings
my @dereko_attr =
  map  { $_->[0] }
  grep { $_->[1] eq 'dereko#structure' } (
    ['sentences',  $base_sentences],
    ['paragraphs', $base_paragraphs],
    ['pagebreaks', $base_pagebreaks]
  );

# Define supported (and preinstalled) transformation modules.
# Each entry is given as [Foundry, Layer] with an optional
# third element to configure the layer.
my @layers = (

  # Base - only if no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CorpusExplorer
  ['CorpusExplorer', 'Morpho'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure']),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence']
     : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure'
     ? ['RWK', 'Structure']
     : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Spacy
  ['Spacy', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # UDPipe
  ['UDPipe', 'Morpho'],
  ['UDPipe', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akron4fa37c32017-01-20 14:43:10 +0100580
# Check filters
my @filtered_anno;

# Everything is skipped - only respect the annotations
# passed explicitly in the form of 'Foundry#Layer'
if ($skip{'#all'}) {
  @filtered_anno = map {; [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
599
Akrone1dbc382016-07-08 22:24:52 +0200600
# TODO: This should not be initialized for batch
# Cache shared via a memory mapped file
my $cache = do {
  my $size = defined $cfg{cache_size} ? $cfg{cache_size} : '50m';
  my $init = defined $cfg{cache_init} ? $cfg{cache_init} : 1;

  Cache::FastMmap->new(
    share_file     => $cache_file,
    cache_size     => $size,
    init_file      => $init,
    unlink_on_exit => $cache_delete
  );
};

# Create batch object
my $batch_file = do {
  my $koral      = defined $cfg{koral} ? $cfg{koral} : $KORAL_VERSION;
  my $non_word   = defined $cfg{non_word_tokens} ? $cfg{non_word_tokens} : 0;
  my $non_verbal = defined $cfg{non_verbal_tokens} ? $cfg{non_verbal_tokens} : 0;

  KorAP::XML::Batch::File->new(
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    gzip              => $gzip,
    log               => $log,
    koral             => $koral,
    anno              => \@filtered_anno,
    non_word_tokens   => $non_word,
    non_verbal_tokens => $non_verbal,
    lang              => $cfg{lang},
  );
};
624
# Auto adjust jobs
if ($jobs eq '-1') {
  my ($auto_jobs, $report) = count_jobs();
  $jobs = $auto_jobs;
  print "$report\n" unless $q;
};

# Glob and prefix files
if (@input) {

  # Prefix all inputs with the input root (if given),
  # expand wildcards and sort the resulting files by length
  @input =
    sort { length($a) <=> length($b) }
    map  { bsd_glob($input_base ? catfile($input_base, $_) : $_) }
    @input;

  unless ($q) {
    print 'Input is ' . join(', ', @input) . "\n";
  };
};
650
# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # Wildcards without any match leave no input to process
  unless (defined $input) {
    $log->error('No input found');
    exit 1;
  };

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time elapsed since the last call
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document -
  # the input path requires a trailing slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  stop_time();

  exit;
};
680
Nils Diewald59094f22014-11-05 18:20:50 +0000681
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the primary archive has to be a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    # Extract without any report
    if ($q) {
      $archive->extract_sigle($q, [$sigle], $output, $jobs);
      next;
    };

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    #   - prefix???
    my $extracted = $archive->extract_sigle(0, [$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
730
Akron81500102017-04-07 20:45:44 +0200731
Akron941c1a62016-02-23 17:41:41 +0100732# Process an archive
733elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000734
Akron81500102017-04-07 20:45:44 +0200735 my $archive_output;
736
737 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100738 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200739
740 # Create new archive object
741 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
742
743 # Check zip capabilities
744 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200745 $log->error("Unzip is not installed or incompatible.");
746 exit 1;
Akron81500102017-04-07 20:45:44 +0200747 };
748
749 # Add further annotation archived
750 $archive->attach($_) foreach @input[1..$#input];
751
752 # Create a temporary directory
753 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200754 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100755 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200756 };
757
Akron63f20d42017-04-10 23:40:29 +0200758 # Add some random extra to avoid clashes with multiple archives
759 $extract_dir = catdir($extract_dir, random_string('cccccc'));
760
Akron31a08cb2019-02-20 20:43:26 +0100761 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100762 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
763 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200764 @input = ($extract_dir);
765 }
766 else {
767 $log->error('Unable to extract from primary archive ' . $input[0] .
768 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200769 exit 1;
Akron81500102017-04-07 20:45:44 +0200770 };
771 }
772
773 # Can't create archive object
774 else {
775 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200776 exit 1;
Akron81500102017-04-07 20:45:44 +0200777 };
778 };
779
Akron7d4cdd82016-08-17 21:39:45 +0200780 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100781 my $pool = Parallel::ForkManager->new($jobs);
782
Akron7d4cdd82016-08-17 21:39:45 +0200783 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100784 my $iter = 1; # Current text in process
785
Akronda3097e2017-04-23 19:53:57 +0200786 my $tar_archive;
787 my $output_dir = $output;
788 my $tar_fh;
Akroncb12af72025-07-15 14:36:10 +0200789 my $final_tar_file;
790 my %tar_pool;
791 my $next_tar = 1; # Counter for tar assignment
Akronda3097e2017-04-23 19:53:57 +0200792
793 # Initialize tar archive
794 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200795 # Set output name
Akroncb12af72025-07-15 14:36:10 +0200796 $final_tar_file = $output;
797 unless ($final_tar_file =~ /\.tar$/) {
798 $final_tar_file .= '.tar';
Akronda3097e2017-04-23 19:53:57 +0200799 };
800
Akroncb12af72025-07-15 14:36:10 +0200801 print "Writing to file $final_tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200802
Akroncb12af72025-07-15 14:36:10 +0200803 # Create tar pool with size equal to number of jobs
804 # If jobs is 0, create just one tar file
805 my $pool_size = $jobs > 0 ? $jobs : 1;
806 for my $i (1..$pool_size) {
807 my ($fh, $temp_tar) = tempfile(
808 "korapxml2krill_pool_${i}_XXXX",
809 SUFFIX => '.tar',
810 TMPDIR => 1
Akroneb370a02022-02-24 13:33:40 +0100811 );
812
Akroncb12af72025-07-15 14:36:10 +0200813 $tar_pool{$i} = {
814 fh => $fh,
815 file => $temp_tar,
816 };
Akroneb370a02022-02-24 13:33:40 +0100817
Akroncb12af72025-07-15 14:36:10 +0200818 if (eval("use Archive::Tar::Builder; 1;")) {
819 ($tar_pool{$i}->{archive} = Archive::Tar::Builder->new(ignore_errors => 1))->set_handle($fh);
820 } else {
821 $tar_pool{$i}->{archive} = KorAP::XML::TarBuilder->new($fh);
822 }
Akroneb370a02022-02-24 13:33:40 +0100823 };
Akronda3097e2017-04-23 19:53:57 +0200824
825 # Output to temporary directory
826 $output_dir = File::Temp->newdir;
827 };
828
Akron941c1a62016-02-23 17:41:41 +0100829 # Report on fork message
830 $pool->run_on_finish (
831 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200832 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100833 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200834
Akrona3518372024-01-22 23:29:00 +0100835 unless ($q) {
836 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
837 $iter . "/$count]" .
838 ($code ? " $code" : '') .
839 ' ' . $data->[0] . "\n";
840 };
841 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200842
843 if (!$code && $to_tar && $data->[2]) {
844 my $filename = $data->[2];
Akroncb12af72025-07-15 14:36:10 +0200845 my $clean_file = fileparse($filename);
Akronda3097e2017-04-23 19:53:57 +0200846
Akroncb12af72025-07-15 14:36:10 +0200847 # Get next available tar file in round-robin fashion
848 my $pool_size = $jobs > 0 ? $jobs : 1;
849 my $pool_idx = $next_tar;
850 $next_tar = ($next_tar % $pool_size) + 1;
Akronda3097e2017-04-23 19:53:57 +0200851
Akroncb12af72025-07-15 14:36:10 +0200852 my $tar = $tar_pool{$pool_idx};
Akron9a062ce2017-07-04 19:12:05 +0200853
Akroncb12af72025-07-15 14:36:10 +0200854 # Lock the tar file before writing
855 flock($tar->{fh}, LOCK_EX);
Akronda3097e2017-04-23 19:53:57 +0200856
Akroncb12af72025-07-15 14:36:10 +0200857 # Add file to pool tar
858 $tar->{archive}->archive_as($filename => $clean_file);
859
860 # Release lock
861 flock($tar->{fh}, LOCK_UN);
862
863 unlink $filename;
Akronda3097e2017-04-23 19:53:57 +0200864 };
865
Akron4c0cf312016-10-15 16:42:09 +0200866 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100867 }
868 );
869
870 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200871 my $temp;
Akrona3518372024-01-22 23:29:00 +0100872 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100873
Akron7d4cdd82016-08-17 21:39:45 +0200874 # unless (Cache::FastMmap->new(
875 # share_file => $cache_file,
876 # cache_size => $cache_size,
877 # init_file => $cache_init
878 # )) {
879 # print "Unable to intialize cache '$cache_file'\n\n";
880 # exit(1);
881 # };
Akron11c80302016-03-18 19:44:43 +0100882
Akron486f9ab2017-04-22 23:25:19 +0200883
Akron941c1a62016-02-23 17:41:41 +0100884 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100885 if (-d $input[0]) {
Akron941c1a62016-02-23 17:41:41 +0100886
Akronce033502024-09-11 10:51:49 +0200887 my @dirs;
888
Akron0a0d1f92024-11-14 14:31:42 +0100889 my $rule = Path::Iterator::Rule->new;
890 $rule->name('data.xml')->file;
891 my $next = $rule->iter(
892 $input[0] => {
893 sorted => 0,
894 depthfirst => -1,
895 error_handler => undef
896 });
897 while (defined(my $file = $next->())) {
898 $file =~ s/\/data\.xml$//;
899 push @dirs, $file;
900 };
Akron941c1a62016-02-23 17:41:41 +0100901
Akrona3518372024-01-22 23:29:00 +0100902 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100903 $t = Benchmark->new;
904 $count = scalar @dirs;
905
906 DIRECTORY_LOOP:
907 for (my $i = 0; $i < $count; $i++) {
908
Akrone1dbc382016-07-08 22:24:52 +0200909 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200910 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200911 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200912 );
Akron941c1a62016-02-23 17:41:41 +0100913
914 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200915 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200916
Akron13d56622016-10-31 14:54:49 +0100917 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200918 $pool->finish(
919 0,
Akronda3097e2017-04-23 19:53:57 +0200920 [
921 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
922 undef,
923 $filename
924 ]
Akron486f9ab2017-04-22 23:25:19 +0200925 );
Akron3ec48972016-08-17 23:24:52 +0200926 }
927 else {
Akron4c0cf312016-10-15 16:42:09 +0200928 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200929 };
Akron941c1a62016-02-23 17:41:41 +0100930 };
931 }
932
933 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200934 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200935
Akron941c1a62016-02-23 17:41:41 +0100936 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200937 $log->error("Unzip is not installed or incompatible.");
938 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100939 };
940
Akron08385f62016-03-22 20:37:04 +0100941 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200942 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100943
Akron31a08cb2019-02-20 20:43:26 +0100944 # Get sigles to extract
945 my $prefix = set_sigle($archive);
946
Akrona3518372024-01-22 23:29:00 +0100947 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100948 $t = Benchmark->new;
949 my @dirs = $archive->list_texts;
950 $count = scalar @dirs;
951
952 ARCHIVE_LOOP:
953 for (my $i = 0; $i < $count; $i++) {
954
955 # Split path information
956 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
957
Akrone1dbc382016-07-08 22:24:52 +0200958 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200959 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200960 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200961 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200962 catfile($corpus, $doc, $text)
963 . '.json' . ($gzip ? '.gz' : '')
964 )
Akrone1dbc382016-07-08 22:24:52 +0200965 );
Akron941c1a62016-02-23 17:41:41 +0100966
967 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200968 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100969
Akron4c0cf312016-10-15 16:42:09 +0200970 # Create temporary file
971 $temp = File::Temp->newdir;
972
Akronbdf434a2016-10-24 17:42:07 +0200973 # TODO: Check if $filename exist at the beginning,
974 # because extraction can be horrible slow!
975
Akron941c1a62016-02-23 17:41:41 +0100976 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100977 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100978
Akron7d4cdd82016-08-17 21:39:45 +0200979 # Create corpus directory
980 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100981
Akron7d4cdd82016-08-17 21:39:45 +0200982 # Temporary directory
983 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100984
Akron7d4cdd82016-08-17 21:39:45 +0200985 # Write file
Akron13d56622016-10-31 14:54:49 +0100986 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200987
Akron4c0cf312016-10-15 16:42:09 +0200988 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100989 $pool->finish(
990 0,
Akronda3097e2017-04-23 19:53:57 +0200991 [
992 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
993 $temp,
994 $filename
995 ]
Akron13d56622016-10-31 14:54:49 +0100996 );
Akron7d4cdd82016-08-17 21:39:45 +0200997 }
998 else {
Akron4c0cf312016-10-15 16:42:09 +0200999 # Delete temporary file
1000 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001001 };
Akron941c1a62016-02-23 17:41:41 +01001002 }
Akron7d4cdd82016-08-17 21:39:45 +02001003
1004 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001005 else {
Akron4c0cf312016-10-15 16:42:09 +02001006 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001007 };
1008 };
1009 }
1010
1011 else {
Akrona3518372024-01-22 23:29:00 +01001012 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +01001013 };
1014
1015 $pool->wait_all_children;
1016
Akroncb12af72025-07-15 14:36:10 +02001017 # Merge all temporary tar files into final tar if needed
1018 if ($to_tar && %tar_pool) {
1019 $| = 1;
1020 print "Merging " . scalar(keys %tar_pool) . " temporary tar files...\n" unless $q;
1021
1022 # Open final tar file
1023 my $final_fh = IO::File->new($final_tar_file, 'w') or die "Cannot open $final_tar_file: $!";
1024 $final_fh->binmode(1);
1025
1026 # Create final archive
1027 my $final_archive;
1028
1029 if (eval("use Archive::Tar::Builder; 1;")) {
1030 $final_archive = Archive::Tar::Builder->new(ignore_errors => 1);
1031 $final_archive->set_handle($final_fh);
1032 } else {
1033 $final_archive = KorAP::XML::TarBuilder->new($final_fh);
1034 }
1035
1036 # Finish and close all pool tar files
1037 foreach my $pool_idx (sort keys %tar_pool) {
1038 my $tar = $tar_pool{$pool_idx};
1039 $tar->{archive}->finish;
1040 $tar->{fh}->close;
1041
1042 # Append temp tar content to final tar using efficient buffered copy
1043 open my $temp_fh, '<:raw', $tar->{file} or die "Cannot open temp tar $tar->{file}: $!";
1044 my $buffer_size = 1024 * 1024; # 1MB buffer
1045 my $buffer;
1046 while (my $bytes_read = read($temp_fh, $buffer, $buffer_size)) {
1047 my $bytes_written = 0;
1048 while ($bytes_written < $bytes_read) {
1049 my $written = syswrite($final_fh, $buffer, $bytes_read - $bytes_written, $bytes_written);
1050 die "Write error: $!" unless defined $written;
1051 $bytes_written += $written;
1052 }
1053 }
1054 close $temp_fh;
1055
1056 # Clean up temp tar
1057 unlink $tar->{file};
1058 }
1059
1060 # Close final tar
1061 $final_archive->finish;
1062 $final_fh->close;
1063 print "Wrote to tar archive $final_tar_file\n" unless $q;
1064 }
1065
Akrona3518372024-01-22 23:29:00 +01001066 unless ($q) {
1067 print timestr(timediff(Benchmark->new, $t))."\n";
1068 print "Done.\n";
1069 };
Akron81500102017-04-07 20:45:44 +02001070};
Akron941c1a62016-02-23 17:41:41 +01001071
Nils Diewald2db9ad02013-10-29 19:26:43 +00001072
# For an archive, this will create the list of all sigles to process.
#
# Works on the file-level @sigle list:
# - If no sigles were given, @sigle is filled with one
#   "corpus/doc/text" entry per text listed by the archive.
# - If sigles were given, every document sigle ("corpus/doc") is
#   handed to extract_sigle() together with $output, and @sigle is
#   reduced to the remaining (text) sigles.
#
# Parameters:
#   $archive - archive object; list_texts, split_path, check_prefix
#              and extract_sigle are called on it.
#
# Returns the prefix indicator: the first value of the last
# split_path() call (no sigles given), the result of check_prefix()
# (sigles given), or the initial value 1 if the archive lists no texts.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {

    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check given sigles for doc sigles
  my @text_sigles;
  my $prefix_checked = 0;

  foreach my $given (@sigle) {

    # A doc sigle has exactly two path segments (optionally behind "./")
    my $is_doc_sigle = $given =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$given ..." if $is_doc_sigle && !$q;

    # Check once if a prefix is needed.
    # The result is assigned before the quiet flag is consulted:
    # "$prefix = $archive->check_prefix && !$q" would store the
    # combined boolean and always yield a false prefix in quiet mode.
    unless ($prefix_checked) {
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix && !$q;
      $prefix_checked = 1;
    };

    # Text sigles are kept for later processing
    unless ($is_doc_sigle) {
      push @text_sigles, $given;
      next;
    };

    # Doc sigles are extracted right away
    print "\n" unless $q;

    my $extracted = $archive->extract_sigle(
      $q, [$given], $output, $sequential_extraction ? 1 : $jobs
    );

    unless ($q) {
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };
  };

  @sigle = @text_sigles;

  return $prefix;
};
1151
1152
# Remove the extraction directory again once processing is finished.
# remove_tree() returns the number of removed objects, which is
# reported on the info log level.
if ($extract_dir) {
  my $removed_count = remove_tree($extract_dir, { safe => 1 });
  $log->info("Removed directory $extract_dir with $removed_count objects");
};


# Always terminate the output with a newline
print "\n";
1161
Nils Diewald2db9ad02013-10-29 19:26:43 +00001162__END__
Akron941c1a62016-02-23 17:41:41 +01001163
1164=pod
1165
1166=encoding utf8
1167
1168=head1 NAME
1169
Akron42f48c12020-02-14 13:08:13 +01001170korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001171
1172
1173=head1 SYNOPSIS
1174
Akron9cb8c982024-03-22 10:46:56 +01001175 $ korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001176
Akron2fd402b2016-10-27 21:26:48 +02001177
Akron941c1a62016-02-23 17:41:41 +01001178=head1 DESCRIPTION
1179
1180L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1181compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001182The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001183
1184
1185=head1 INSTALLATION
1186
1187The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1188
Akron9cb8c982024-03-22 10:46:56 +01001189 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001190
Akronc13a1702016-03-15 19:33:14 +01001191In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001192be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001193Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001194Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1195Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001197
1198=head1 ARGUMENTS
1199
Akron9cb8c982024-03-22 10:46:56 +01001200 $ korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001201
1202Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001203It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001204
Akron941c1a62016-02-23 17:41:41 +01001205=over 2
1206
1207=item B<archive>
1208
Akron9cb8c982024-03-22 10:46:56 +01001209 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001210
Akron2fd402b2016-10-27 21:26:48 +02001211Converts an archive of KorAP-XML documents. It expects a directory
1212(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001213
1214=item B<extract>
1215
Akron9cb8c982024-03-22 10:46:56 +01001216 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001217
1218Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001219
Akron63f20d42017-04-10 23:40:29 +02001220=item B<serial>
1221
Akron9cb8c982024-03-22 10:46:56 +01001222 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001223
Convert archives in serial. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001229
1230
Akron84b53ad2022-01-14 12:39:15 +01001231=item B<slimlog>
1232
Akron9cb8c982024-03-22 10:46:56 +01001233 $ korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001234
Filters out all useless (i.e. successful) information from logs, to simplify
log checks. Expects no further options.
1237
1238
Akron941c1a62016-02-23 17:41:41 +01001239=back
1240
1241
1242=head1 OPTIONS
1243
1244=over 2
1245
Akrona76d8352016-10-27 16:27:32 +02001246=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001247
Akrona76d8352016-10-27 16:27:32 +02001248Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001249
Akron7606afa2016-10-25 16:23:49 +02001250Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001251document, while C<archive> expects a KorAP-XML corpus folder or a zip
1252file to batch process multiple files.
1253C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001254
Akrondee3cf62024-06-14 18:14:48 +02001255C<archive> supports multiple input zip files with the constraint
Akron2cfe8092016-06-24 17:48:49 +02001256that the first archive listed contains all primary data files
1257and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001258
Akron7606afa2016-10-25 16:23:49 +02001259 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001260
Akron821db3d2017-04-06 21:19:31 +02001261Input may also be defined using BSD glob wildcards.
1262
1263 -i 'file/news*.zip'
1264
1265The extended input array will be sorted in length order, so the shortest
1266path needs to contain all primary data files and all meta data files.
1267
Akrondee3cf62024-06-14 18:14:48 +02001268(The directory structure follows the base directory format
Akron0c3e3752016-06-28 15:55:53 +02001269that may include a C<.> root folder.
1270In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001271need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001273
Akron7606afa2016-10-25 16:23:49 +02001274To support zip files, a version of C<unzip> needs to be installed that is
1275compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001276
Akron7606afa2016-10-25 16:23:49 +02001277B<The root folder switch using the hash sign is experimental and
1278may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001279
Akronf73ffb62018-06-27 12:13:59 +02001280
Akron63f20d42017-04-10 23:40:29 +02001281=item B<--input-base|-ib> <directory>
1282
1283The base directory for inputs.
1284
1285
Akron941c1a62016-02-23 17:41:41 +01001286=item B<--output|-o> <directory|file>
1287
1288Output folder for archive processing or
1289document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001290writes to C<STDOUT> by default
1291(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001292
1293=item B<--overwrite|-w>
1294
1295Overwrite files that already exist.
1296
Akronf73ffb62018-06-27 12:13:59 +02001297
Akron3741f8b2016-12-21 19:55:21 +01001298=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001299
1300Define the default tokenization by specifying
1301the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001302of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001303This will directly take the file instead of running
1304the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001305
Akron3741f8b2016-12-21 19:55:21 +01001306
1307=item B<--base-sentences|-bs> <foundry>#<layer>
1308
1309Define the layer for base sentences.
1310If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001311Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1312layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001313
1314 Defaults to unset.
1315
1316
1317=item B<--base-paragraphs|-bp> <foundry>#<layer>
1318
1319Define the layer for base paragraphs.
1320If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001321Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001323
1324 Defaults to unset.
1325
1326
Akron41ac10b2017-02-08 22:47:25 +01001327=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1328
1329Define the layer for base pagebreaks.
1330Currently C<DeReKo#Structure> is the only layer supported.
1331
1332 Defaults to unset.
1333
1334
Akron941c1a62016-02-23 17:41:41 +01001335=item B<--skip|-s> <foundry>[#<layer>]
1336
Akronf7ad89e2016-03-16 18:22:47 +01001337Skip specific annotations by specifying the foundry
1338(and optionally the layer with a C<#>-prefix),
1339e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001340Can be set multiple times.
1341
Akronf73ffb62018-06-27 12:13:59 +02001342
Akronc13a1702016-03-15 19:33:14 +01001343=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001344
Akronf7ad89e2016-03-16 18:22:47 +01001345Convert specific annotations by specifying the foundry
1346(and optionally the layer with a C<#>-prefix),
1347e.g. C<Mate> or C<Mate#Morpho>.
1348Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001349
Akronf73ffb62018-06-27 12:13:59 +02001350
Akroned9baf02019-01-22 17:03:25 +01001351=item B<--non-word-tokens|-nwt>
1352
1353Tokenize non-word tokens like word tokens (defined as matching
C</[\d\w]/>). Useful to treat punctuation as tokens.
1355
1356 Defaults to unset.
1357
Akronf1849aa2019-12-16 23:35:33 +01001358
1359=item B<--non-verbal-tokens|-nvt>
1360
Tokenize non-verbal tokens, which are marked in the primary data by
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1363
1364 Defaults to unset.
1365
1366
Akron941c1a62016-02-23 17:41:41 +01001367=item B<--jobs|-j>
1368
Akron29128262024-04-17 15:50:36 +02001369Define the number of spawned forks for concurrent jobs
1370of archive processing.
Akron11c80302016-03-18 19:44:43 +01001371Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001372
Akrona472a242023-02-13 13:46:30 +01001373If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001374also apply to extraction.
1375
Akronebbac2e2024-03-22 10:31:23 +01001376Pass C<-1>, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001377times the number of available cores, in case L<Sys::Info>
Akronebbac2e2024-03-22 10:31:23 +01001378is available and can read CPU count (see C<--job-count>).
1379Be aware, that the report of available cores
Akron29128262024-04-17 15:50:36 +02001380may not work in certain conditions. Benchmarking the processing
1381speed based on the number of jobs may be valuable.
Akronebbac2e2024-03-22 10:31:23 +01001382
Akronf7ad89e2016-03-16 18:22:47 +01001383This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001384
Akronf73ffb62018-06-27 12:13:59 +02001385
Akronebbac2e2024-03-22 10:31:23 +01001386=item B<--job-count|-jc>
1387
1388Print job and core information that would be used if
1389C<-1> was passed to C<--jobs>.
1390
1391
Akron263274c2019-02-07 09:48:30 +01001392=item B<--koral|-k>
1393
1394Version of the output format. Supported versions are:
1395C<0> for legacy serialization, C<0.03> for serialization
1396with metadata fields as key-values on the root object,
1397C<0.4> for serialization with metadata fields as a list
1398of C<"@type":"koral:field"> objects.
1399
1400Currently defaults to C<0.03>.
1401
1402
Akron9ec88872017-04-12 16:29:06 +02001403=item B<--sequential-extraction|-se>
1404
Flag to indicate that extraction should run sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1406Some systems may have problems with extracting multiple archives
1407to the same folder at the same time.
1408Can be flagged using C<--no-sequential-extraction> as well.
1409Defaults to C<false>.
1410
Akronf73ffb62018-06-27 12:13:59 +02001411
Akron35db6e32016-03-17 22:42:22 +01001412=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001413
Akron35db6e32016-03-17 22:42:22 +01001414Define the metadata parser to use. Defaults to C<I5>.
1415Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1416This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001417
Akronf73ffb62018-06-27 12:13:59 +02001418
Akron941c1a62016-02-23 17:41:41 +01001419=item B<--gzip|-z>
1420
Akronf7ad89e2016-03-16 18:22:47 +01001421Compress the output.
1422Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001423
Akronf73ffb62018-06-27 12:13:59 +02001424
Akron11c80302016-03-18 19:44:43 +01001425=item B<--cache|-c>
1426
1427File to mmap a cache (using L<Cache::FastMmap>).
1428Defaults to C<korapxml2krill.cache> in the calling directory.
1429
Akronf73ffb62018-06-27 12:13:59 +02001430
Akron11c80302016-03-18 19:44:43 +01001431=item B<--cache-size|-cs>
1432
1433Size of the cache. Defaults to C<50m>.
1434
Akronf73ffb62018-06-27 12:13:59 +02001435
Akron11c80302016-03-18 19:44:43 +01001436=item B<--cache-init|-ci>
1437
1438Initialize cache file.
1439Can be flagged using C<--no-cache-init> as well.
1440Defaults to C<true>.
1441
Akronf73ffb62018-06-27 12:13:59 +02001442
Akron11c80302016-03-18 19:44:43 +01001443=item B<--cache-delete|-cd>
1444
1445Delete cache file after processing.
1446Can be flagged using C<--no-cache-delete> as well.
1447Defaults to C<true>.
1448
Akronf73ffb62018-06-27 12:13:59 +02001449
Akron636aa112017-04-07 18:48:56 +02001450=item B<--config|-cfg>
1451
Configure the parameters of your call in a file
of key-value pairs with whitespace separator:
1454
1455 overwrite 1
1456 token DeReKo#Structure
1457 ...
1458
1459Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001460C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akronc0ac4ff2024-04-15 18:03:15 +02001461C<token>, C<log>,
1462C<cache>, C<cache-size>, C<cache-init>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001463C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001464C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001465C<base-sentences>, C<base-paragraphs>,
1466C<base-pagebreaks>,
1467C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001468(semicolon separated), C<anno> (semicolon separated).
1469
Akronf73ffb62018-06-27 12:13:59 +02001470Configuration parameters will always be overwritten by
1471passed parameters.
1472
1473
Akron81500102017-04-07 20:45:44 +02001474=item B<--temporary-extract|-te>
1475
Akrona472a242023-02-13 13:46:30 +01001476Only valid for the C<archive> and C<serial>
1477commands.
Akron81500102017-04-07 20:45:44 +02001478
1479This will first extract all files into a
1480directory and then will archive.
1481If the directory is given as C<:temp:>,
1482a temporary directory is used.
1483This is especially useful to avoid
1484massive unzipping and potential
1485network latency.
Akron636aa112017-04-07 18:48:56 +02001486
Akronf73ffb62018-06-27 12:13:59 +02001487
Akronc93a0802019-07-11 15:48:34 +02001488=item B<--to-tar>
1489
1490Only valid for the C<archive> command.
1491
1492Writes the output into a tar archive.
1493
1494
Akrone10ad322016-02-27 10:54:26 +01001495=item B<--sigle|-sg>
1496
Akron20807582016-10-26 17:11:34 +02001497Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001498Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001499I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001500Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001501In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001502On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001503
Akron64f7fae2022-07-27 12:45:33 +02001504=item B<--lang>
1505
1506Preferred language for metadata fields. In case multiple titles are
1507given (on any level) with different C<xml:lang> attributes,
1508the language given is preferred.
1509Because titles may have different sources and different priorities,
1510non-specific language titles may still be preferred in case the title
1511source has a higher priority.
1512
Akronf73ffb62018-06-27 12:13:59 +02001513
Akron941c1a62016-02-23 17:41:41 +01001514=item B<--log|-l>
1515
Akronb9c33812020-10-21 16:19:35 +02001516The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001517
Akronf73ffb62018-06-27 12:13:59 +02001518
Akrona3518372024-01-22 23:29:00 +01001519=item B<--quiet>
1520
1521Silence all information (non-log) outputs.
1522
1523
Akron941c1a62016-02-23 17:41:41 +01001524=item B<--help|-h>
1525
Akron42f48c12020-02-14 13:08:13 +01001526Print help information.
Akron941c1a62016-02-23 17:41:41 +01001527
Akronf73ffb62018-06-27 12:13:59 +02001528
Akron941c1a62016-02-23 17:41:41 +01001529=item B<--version|-v>
1530
1531Print version information.
1532
1533=back
1534
Akron311e29b2024-09-11 11:46:09 +02001535=head1 PERFORMANCE
1536
1537There are some ways to improve performance for large tasks:
1538
=over 2

=item First unpack

Using the C<archive> or C<serial> command on one or multiple zip files
can be very slow, as it needs to unpack small portions every time.
It's better to use C<--temporary-extract> to unpack the whole archive
first into a temporary directory and then read the extracted files.
This is especially important for remote archives.

=item Limit annotations

Per default, all supported annotation layers are sought. This can be limited
by adding C<--skip '#ALL'> and only listing the expected annotations with C<--anno>.

=item Checking the parallel job count

By providing the number of parallel jobs using C<--jobs>, the execution can be tailored to specific
hardware environments.

=item Install ripunzip

For full extraction of data, L<ripunzip|https://github.com/google/ripunzip> can be
used for improved performance.

=back
1561
1562
Akronc13a1702016-03-15 19:33:14 +01001563=head1 ANNOTATION SUPPORT
1564
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1569
Akron821db3d2017-04-06 21:19:31 +02001570 Base
1571 #Paragraphs
1572 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001573
Akron821db3d2017-04-06 21:19:31 +02001574 Connexor
1575 #Morpho
1576 #Phrase
1577 #Sentences
1578 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001579
Akron821db3d2017-04-06 21:19:31 +02001580 CoreNLP
1581 #Constituency
1582 #Morpho
1583 #NamedEntities
1584 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001585
Akron5530a552022-02-17 17:53:15 +01001586 CorpusExplorer
1587 #Morpho
1588
Akronce125b62017-06-19 11:54:36 +02001589 CMC
1590 #Morpho
1591
Akron821db3d2017-04-06 21:19:31 +02001592 DeReKo
1593 #Structure
Akronc13a1702016-03-15 19:33:14 +01001594
Akron57510c12019-01-04 14:58:53 +01001595 DGD
1596 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001597 #Structure
Akron57510c12019-01-04 14:58:53 +01001598
Akron821db3d2017-04-06 21:19:31 +02001599 DRuKoLa
1600 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001601
Akron821db3d2017-04-06 21:19:31 +02001602 Glemm
1603 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001604
Akronabb36902021-10-11 15:51:06 +02001605 Gingko
1606 #Morpho
1607
Akronea1aed52018-07-19 14:43:34 +02001608 HNC
1609 #Morpho
1610
Akron4c679192018-01-16 17:41:49 +01001611 LWC
1612 #Dependency
1613
Akron821db3d2017-04-06 21:19:31 +02001614 Malt
1615 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001616
Akron821db3d2017-04-06 21:19:31 +02001617 MarMoT
1618 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001619
Akron821db3d2017-04-06 21:19:31 +02001620 Mate
1621 #Dependency
1622 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001623
Akron821db3d2017-04-06 21:19:31 +02001624 MDParser
1625 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001626
Akrone85a7762022-07-22 08:05:03 +02001627 NKJP
1628 #Morpho
1629 #NamedEntities
1630
Akron821db3d2017-04-06 21:19:31 +02001631 OpenNLP
1632 #Morpho
1633 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001634
Akron07e24772020-04-23 14:00:54 +02001635 RWK
1636 #Morpho
1637 #Structure
1638
Akron821db3d2017-04-06 21:19:31 +02001639 Sgbr
1640 #Lemma
1641 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001642
Marc Kupietzb8c53822024-03-16 18:54:08 +01001643 Spacy
1644 #Morpho
1645
Akron7d5e6382019-08-08 16:36:27 +02001646 Talismane
1647 #Dependency
1648 #Morpho
1649
Akron821db3d2017-04-06 21:19:31 +02001650 TreeTagger
1651 #Morpho
1652 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001653
Akron83aedd32023-02-07 10:57:41 +01001654 UDPipe
1655 #Dependency
1656 #Morpho
1657
Akron821db3d2017-04-06 21:19:31 +02001658 XIP
1659 #Constituency
1660 #Morpho
1661 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001662
Akronc13a1702016-03-15 19:33:14 +01001663
1664More importers are in preparation.
1665New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1666See the built-in annotation importers as examples.
1667
Akronf73ffb62018-06-27 12:13:59 +02001668
Akron41e6c8b2021-10-14 20:22:18 +02001669=head1 METADATA SUPPORT
1670
L<KorAP::XML::Krill> has built-in importers for some meta data variants
that are part of the KorAP preprocessing pipeline.
Akron41e6c8b2021-10-14 20:22:18 +02001673
1674=over 2
1675
Akron1d101492024-06-06 12:47:35 +02001676=item B<I5>
Akron41e6c8b2021-10-14 20:22:18 +02001677
Akron1d101492024-06-06 12:47:35 +02001678Meta data for all I5 files
Akron41e6c8b2021-10-14 20:22:18 +02001679
Akron1d101492024-06-06 12:47:35 +02001680=item B<Sgbr>
Akron41e6c8b2021-10-14 20:22:18 +02001681
Akron1d101492024-06-06 12:47:35 +02001682Meta data from the Schreibgebrauch project
Akron2532f1b2023-05-15 13:41:24 +02001683
Akron1d101492024-06-06 12:47:35 +02001684=item B<Gingko>
1685
1686Meta data from the Gingko project in addition to I5
1687
1688=item B<ICC>
1689
1690Meta data for the ICC in addition to I5
1691
1692=item B<NKJP>
1693
1694Meta data for the NKJP corpora
Akron24ad3c02024-06-03 12:38:20 +02001695
Akron41e6c8b2021-10-14 20:22:18 +02001696=back
1697
Akron41e6c8b2021-10-14 20:22:18 +02001698New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1699See the built-in meta data importers as examples.
1700
Akron4b001ce2024-06-06 12:32:11 +02001701The I5 metadata definition is based on TEI-P5 and supports C<E<lt>xenoDataE<gt>>
Akron82064bb2024-06-17 12:53:23 +02001702with C<E<lt>metaE<gt>> elements like
Akron4b001ce2024-06-06 12:32:11 +02001703
1704 <meta type="..." name="..." project="..." desc="...">...</meta>
1705
1706that are directly translated to Krill objects. The supported values are:
1707
1708=over 2
1709
Akron1d101492024-06-06 12:47:35 +02001710=item C<type>
Akron4b001ce2024-06-06 12:32:11 +02001711
1712=over 4
1713
Akron1d101492024-06-06 12:47:35 +02001714=item C<string>
Akron4b001ce2024-06-06 12:32:11 +02001715
Akron1d101492024-06-06 12:47:35 +02001716String meta data value
Akron4b001ce2024-06-06 12:32:11 +02001717
Akron1d101492024-06-06 12:47:35 +02001718=item C<keyword>
Akron4b001ce2024-06-06 12:32:11 +02001719
Akrondee3cf62024-06-14 18:14:48 +02001720String meta data value that can be given multiple times
Akron4b001ce2024-06-06 12:32:11 +02001721
Akron1d101492024-06-06 12:47:35 +02001722=item C<text>
Akron4b001ce2024-06-06 12:32:11 +02001723
Akrondee3cf62024-06-14 18:14:48 +02001724String meta data value that is tokenized and can be searched as token sequences
Akron4b001ce2024-06-06 12:32:11 +02001725
Akron1d101492024-06-06 12:47:35 +02001726=item C<date>
1727
1728Date meta data value (as "yyyy/mm/dd" with optional granularity)
1729
1730=item C<integer>
1731
1732Numerical meta data value
1733
Akrondee3cf62024-06-14 18:14:48 +02001734=item C<attachment>
Akron1d101492024-06-06 12:47:35 +02001735
1736Non-indexed meta data value (only retrievable)
1737
1738=item C<uri>
1739
1740Non-indexed attached URI, takes the desc as the title for links
Akron4b001ce2024-06-06 12:32:11 +02001741
1742=back
1743
Akron1d101492024-06-06 12:47:35 +02001744=item C<name>
Akron4b001ce2024-06-06 12:32:11 +02001745
Akrondee3cf62024-06-14 18:14:48 +02001746The key of the meta object that may be prefixed by C<corpus> or C<doc>, in case the
Akron693f5882024-06-06 12:52:39 +02001747C<E<lt>xenoDataE<gt>> information is located on these levels. The text level introduces
1748no prefixes.
Akron4b001ce2024-06-06 12:32:11 +02001749
Akron1d101492024-06-06 12:47:35 +02001750=item C<project> (optional)
Akron4b001ce2024-06-06 12:32:11 +02001751
Akron1d101492024-06-06 12:47:35 +02001752A prefixed namespace of the key
1753
1754=item C<desc> (optional)
1755
1756A description of the key
1757
1758=item text content
1759
1760The value of the meta object
Akron4b001ce2024-06-06 12:32:11 +02001761
1762=back
1763
Akron41e6c8b2021-10-14 20:22:18 +02001764
Akron8f69d632020-01-15 16:58:11 +01001765=head1 About KorAP-XML
1766
1767KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1768data model (Bański et al. 2013), where text data are stored physically
1769separated from their interpretations (i.e. annotations).
1770A text document in KorAP-XML therefore consists of several files
1771containing primary data, metadata and annotations.
1772
1773The structure of a single KorAP-XML document can be as follows:
1774
1775 - data.xml
1776 - header.xml
1777 + base
1778 - tokens.xml
1779 - ...
1780 + struct
1781 - structure.xml
1782 - ...
1783 + corenlp
1784 - morpho.xml
1785 - constituency.xml
1786 - ...
1787 + tree_tagger
1788 - morpho.xml
1789 - ...
1790 - ...
1791
1792The C<data.xml> contains the primary data, the C<header.xml> contains
1793the metadata, and the annotation layers are stored in subfolders
1794like C<base>, C<struct> or C<corenlp>
1795(so-called "foundries"; Bański et al. 2013).
1796
1797Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001798(Lüngen and Sperberg-McQueen 2012). See the documentation in
1799L<KorAP::XML::Meta::I5> for translatable fields.
1800
1801Annotations correspond to a variant of the TEI-P5 feature structures
1802(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001803Annotation feature structures refer to character sequences of the primary text
1804inside the C<text> element of the C<data.xml>.
1805A single annotation containing the lemma of a token can have the following structure:
1806
1807 <span from="0" to="3">
1808 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1809 <f name="lex">
1810 <fs>
1811 <f name="lemma">zum</f>
1812 </fs>
1813 </f>
1814 </fs>
1815 </span>
1816
The C<from> and C<to> attributes refer to the character span
in the primary text.
Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
the structure may vary. See the C<KorAP::XML::Annotation::*> modules for various
annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001822
Multiple KorAP-XML documents are organized on three levels following
the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
corpus E<gt> document E<gt> text. Metadata can be stored on each level,
and C<korapxml2krill> merges it into a single metadata
object per text. A corpus is therefore structured as follows:
1828
1829 + <corpus>
1830 - header.xml
1831 + <document>
1832 - header.xml
1833 + <text>
1834 - data.xml
1835 - header.xml
1836 - ...
1837 - ...
1838
1839A single text can be identified by the concatenation of
1840the corpus identifier, the document identifier and the text identifier.
1841This identifier is called the text sigle
1842(e.g. a text with the identifier C<18486> in the document C<060> in the
1843corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1844
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
(e.g. one zip file per foundry), which is also supported (see C<--input>).
1848
Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
in the form of a test suite.
1851The resulting JSON format merges all annotation layers
1852based on a single token stream.
1853
1854=head2 References
1855
1856Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1857KorAP data model: first approximation, December.
1858
1859Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1860"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1861Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1862L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1863
1864Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1865"Robust corpus architecture: a new look at virtual collections and data access",
1866Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1867L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1868
1869Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1870Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1872Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1873pp. 373-376.
1874L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1875
1876Harald Lüngen and C. M. Sperberg-McQueen (2012):
1877"A TEI P5 Document Grammar for the IDS Text Model",
1878Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1879L<PDF|https://journals.openedition.org/jtei/pdf/508>
1880
1881TEI Consortium, eds:
1882"Feature Structures",
1883Guidelines for Electronic Text Encoding and Interchange.
1884L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1885
Akron941c1a62016-02-23 17:41:41 +01001886=head1 AVAILABILITY
1887
1888 https://github.com/KorAP/KorAP-XML-Krill
1889
1890
1891=head1 COPYRIGHT AND LICENSE
1892
Akrona3518372024-01-22 23:29:00 +01001893Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001894
Akron6882d7d2021-02-08 09:43:57 +01001895Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001896
Contributors: Eliza Margaretha, Marc Kupietz
Akron941c1a62016-02-23 17:41:41 +01001898
Akron6882d7d2021-02-08 09:43:57 +01001899L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001900Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001901L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001902member of the
Akronf1849aa2019-12-16 23:35:33 +01001903L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001904
1905This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001906L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001907
1908=cut