blob: a40d75afc10850f1fbb42373d3a0f3f1f44582fa [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron7d4d2d72024-09-05 11:05:35 +02004use v5.32;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010018use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron0a0d1f92024-11-14 14:31:42 +010022use Path::Iterator::Rule;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Marc Kupietzb8c53822024-03-16 18:54:08 +0100174# 2024/03/20
175# - Added Spacy support.
176#
Akronebbac2e2024-03-22 10:31:23 +0100177# 2024/03/22
178# - Improve core count logging.
Akron941c1a62016-02-23 17:41:41 +0100179# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100180
# Release information used for the version banner
our $LAST_CHANGE   = '2024/11/14';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command: the first argument is treated as the command,
# unless it starts with a dash (and is therefore an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Remember the arguments as given, before option parsing consumes them
my @keep_argv = @ARGV;

# List-valued options
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100200
# Count jobs/cores if not set.
#
# Tries to detect the number of CPU cores with the optional Sys::Info
# module and sets the number of jobs to five times the core count.
# If the module can't be loaded, the lookup fails or no positive count
# is reported, an error is logged and a single core is assumed.
#
# Returns a list of two values: the number of jobs and a
# human readable message describing the decision.
sub count_jobs {
  my $cores;

  # Sys::Info is optional - load it at runtime in a block eval
  # instead of compiling a string
  my $has_sys_info = eval {
    require Sys::Info;
    require Sys::Info::Constants;
    Sys::Info::Constants->import(':device_cpu');
    1;
  };

  # A failing lookup is treated like an unknown core count
  if ($has_sys_info) {
    $cores = eval { Sys::Info->new->device('CPU')->count };
  };

  unless (defined $cores && $cores > 0) {
    $log->error('Unable to determine number of cores - set to 1');
    $cores = 1;
  };

  my $jobs = ceil(5 * $cores);
  return $jobs, "Run using $jobs jobs on $cores cores";
}
220
# Parse options from the command line

# Path to an optional configuration file
my $cfg_file;

# Options that are stored directly in the configuration hash.
# The hash key is the long option name with dashes replaced
# by underscores, e.g. 'base-sentences|bs=s' is stored
# in $cfg{base_sentences}.
my @cfg_specs = qw(
  input-base|ib=s
  output|o=s
  overwrite|w
  meta|m=s
  token|t=s
  base-sentences|bs=s
  base-paragraphs|bp=s
  base-pagebreaks|bpb=s
  gzip|z
  temporary-extract|te=s
  cache|c=s
  lang=s
  log|l=s
  quiet
  jobs|j=i
  koral|k=f
  to-tar
  non-word-tokens|nwt
  non-verbal-tokens|nvt
  sequential-extraction|se
  cache-size|cs=s
  cache-delete|cd!
  cache-init|ci!
);

# Pair each specification with a reference to its slot in %cfg
my @cfg_targets = map {
  my ($long_name) = m/^([^|=!]+)/;
  ($_ => \$cfg{ $long_name =~ tr/-/_/r });
} @cfg_specs;

# Arguments for the full usage message
my %usage_args = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-'
);

GetOptions(
  @cfg_targets,

  # List-valued options
  'input|i=s'  => \@input,
  'skip|s=s'   => \@skip,
  'sigle|sg=s' => \@sigle,
  'anno|a=s'   => \@anno,

  'config|cfg=s' => \$cfg_file,

  # Flags that are accepted but no longer have an effect
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Informational switches
  'help|h' => sub {
    pod2usage(%usage_args);
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  },
  'job-count|jc' => sub {
    my ($j, $msg) = count_jobs();
    pod2usage(
      -verbose => 0,
      -msg     => $msg,
      -output  => '-'
    );
  }
);

# Usage message with a failing exit code, shown on invalid invocations
my %ERROR_HASH = (
  %usage_args,
  -exit => 1
);
Akron63f20d42017-04-10 23:40:29 +0200289
# Load from configuration and fill non-given data.
# Values passed on the command line always take precedence
# over values from the configuration file.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options - the configuration uses dashes,
  # the internal hash uses underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log lang cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init cache-delete
                      koral extract-dir jobs quiet!) {
    my $underlined = $key =~ tr/-/_/r;
    next if defined $cfg{$underlined};
    next unless defined $config{$key};
    $cfg{$underlined} = $config{$key};
  };

  # List options (skip, sigle, anno) are given as
  # semicolon separated strings
  my %list_for = (
    skip  => \@skip,
    sigle => \@sigle,
    anno  => \@anno
  );

  foreach my $key (sort keys %list_for) {
    my $list = $list_for{$key};
    next if scalar(@$list) || !defined $config{$key};
    @$list = split /\s*;\s*/, $config{$key};
  };
}

# A configuration file was requested but does not exist -
# continue with the command line options, but tell the user
elsif ($cfg_file) {
  warn "Configuration file '$cfg_file' not found - ignoring\n";
};
325
# Init variables and set default values
my $output                = $cfg{output};
my $input_base            = $cfg{input_base};
my $gzip                  = $cfg{gzip};
my $to_tar                = $cfg{to_tar};
my $extract_dir           = $cfg{temporary_extract};
my $token_base            = $cfg{token} // 'OpenNLP#tokens';
my $cache_file            = $cfg{cache} // 'korapxml2krill.cache';
my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Boolean quiet flag (a double negation is always defined,
# so no default is required)
my $q = !!($cfg{quiet});

# Get tokenization basis.
# The variables are declared separately from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension (there may be no layer at all,
# if the token base has no '#' separator)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of the lowercased skip values
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR using the requested level (defaults to ERROR)
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);
Akron63f20d42017-04-10 23:40:29 +0200357
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first remaining argument
  my $log_file = shift @ARGV;

  # The argument may be missing entirely - check for definedness first
  # to avoid a file test on an undefined value
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  my $slimmer = KorAP::XML::Log::Slim->new($log_file);

  # Run log filter
  $slimmer->slim_to;

  exit;
};
379
380
# For all commands the output has to be an existing directory,
# unless the to-tar option is set
if ($cmd && $output && !defined($to_tar)) {
  unless (-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000391
# Start serial processing:
# Every input is processed by a separate run of this script
# using the 'archive' command.
if ($cmd && $cmd eq 'serial') {

  # Remove all input and output parameters from the original arguments,
  # as they are set individually for each archive run.
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - the value follows as the next argument
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    }

    # Input or output given in the form --input=VALUE
    elsif (m/\A--?(?:input|i|output|o)=/) {
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    unless ($q) {
      print "Start serial processing of $_ to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving - failures are reported, but the
    # remaining inputs are still processed
    my $status = system @archive_cmd;
    if ($status == -1) {
      $log->error("Unable to start archiving of $_: $!");
    }
    elsif ($status != 0) {
      $log->error("Archiving of $_ failed with wait status $status");
    };
  };

  exit;
};
444
# Define supported (and preinstalled) transformation modules.
# Each entry is a [Foundry, Layer] pair, optionally followed by
# a third value that is passed along with the pair.

# DeReKo structure may act as the base for sentences, paragraphs
# and pagebreaks - collect all bases it is requested for
my @dereko_attr = ();
foreach my $base (
  [$base_sentences,  'sentences'],
  [$base_paragraphs, 'paragraphs'],
  [$base_pagebreaks, 'pagebreaks']
) {
  push @dereko_attr, $base->[1] if $base->[0] eq 'dereko#structure';
};

my @layers = (

  # Base - only if no other base is defined
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CorpusExplorer
  ['CorpusExplorer', 'Morpho'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure']),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence']
     : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure'
     ? ['RWK', 'Structure']
     : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Spacy
  ['Spacy', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # UDPipe
  ['UDPipe', 'Morpho'],
  ['UDPipe', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akrone1dbc382016-07-08 22:24:52 +0200579
Akron4fa37c32017-01-20 14:43:10 +0100580
# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly given
# annotations (in the form Foundry#Layer)
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
599
Akrone1dbc382016-07-08 22:24:52 +0200600
# TODO: This should not be initialized for batch
# Create the cache object based on a shared file
my $cache = do {
  my @cache_params = (
    share_file     => $cache_file,
    cache_size     => ($cfg{cache_size} // '50m'),
    init_file      => ($cfg{cache_init} // 1),
    unlink_on_exit => $cache_delete
  );
  Cache::FastMmap->new(@cache_params);
};

# Create batch object
my $batch_file = do {
  my @batch_params = (
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    gzip              => $gzip,
    log               => $log,
    koral             => ($cfg{koral} // $KORAL_VERSION),
    anno              => \@filtered_anno,
    non_word_tokens   => ($cfg{non_word_tokens} // 0),
    non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
    lang              => $cfg{lang},
  );
  KorAP::XML::Batch::File->new(@batch_params);
};
624
# Auto adjust jobs
if ($jobs eq '-1') {
  ($jobs, my $msg) = count_jobs();
  print $msg . "\n" unless $q;
};

# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@new_input, bsd_glob($pattern));
  };

  # Globbing may yield nothing (e.g. a wildcard without any match) -
  # stop here instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n" unless $q;
};
650
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # Both timers are initialized at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time spent since the last stop and since the start,
  # and remember the current time as the last stop
  sub stop_time {
    my $now        = Benchmark->new;
    my $since_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_stop . ' (overall: ' . $overall . ')'
    );

    $main::LAST_STOP = $now;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  stop_time();

  exit;
};
680
Nils Diewald59094f22014-11-05 18:20:50 +0000681
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the primary input has to be a file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n" unless $q;

    # TODO: Make this OS independent
    # TODO:
    #   - prefix???
    # The first argument is 1 in quiet mode and 0 otherwise
    # (presumably a quiet flag - confirm against KorAP::XML::Archive)
    my $extracted = $archive->extract_sigle(
      ($q ? 1 : 0), [$sigle], $output, $jobs
    );

    # Report the result
    unless ($q) {
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };
  };
}
730
Akron81500102017-04-07 20:45:44 +0200731
Akron941c1a62016-02-23 17:41:41 +0100732# Process an archive
733elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000734
Akron81500102017-04-07 20:45:44 +0200735 my $archive_output;
736
737 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100738 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200739
740 # Create new archive object
741 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
742
743 # Check zip capabilities
744 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200745 $log->error("Unzip is not installed or incompatible.");
746 exit 1;
Akron81500102017-04-07 20:45:44 +0200747 };
748
749 # Add further annotation archived
750 $archive->attach($_) foreach @input[1..$#input];
751
752 # Create a temporary directory
753 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200754 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100755 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200756 };
757
Akron63f20d42017-04-10 23:40:29 +0200758 # Add some random extra to avoid clashes with multiple archives
759 $extract_dir = catdir($extract_dir, random_string('cccccc'));
760
Akron31a08cb2019-02-20 20:43:26 +0100761 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100762 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
763 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200764 @input = ($extract_dir);
765 }
766 else {
767 $log->error('Unable to extract from primary archive ' . $input[0] .
768 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200769 exit 1;
Akron81500102017-04-07 20:45:44 +0200770 };
771 }
772
773 # Can't create archive object
774 else {
775 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200776 exit 1;
Akron81500102017-04-07 20:45:44 +0200777 };
778 };
779
Akron7d4cdd82016-08-17 21:39:45 +0200780 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100781 my $pool = Parallel::ForkManager->new($jobs);
782
Akron7d4cdd82016-08-17 21:39:45 +0200783 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100784 my $iter = 1; # Current text in process
785
Akronda3097e2017-04-23 19:53:57 +0200786 my $tar_archive;
787 my $output_dir = $output;
788 my $tar_fh;
789
790 # Initialize tar archive
791 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200792
793 # Set output name
794 my $tar_file = $output;
795 unless ($tar_file =~ /\.tar$/) {
796 $tar_file .= '.tar';
797 };
798
799 # Initiate the tar file
Akrona3518372024-01-22 23:29:00 +0100800 print "Writing to file $tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200801 $tar_fh = IO::File->new($tar_file, 'w');
802 $tar_fh->binmode(1);
803
Akroneb370a02022-02-24 13:33:40 +0100804 # Use tar builder for archiving
805 if (eval("use Archive::Tar::Builder; 1;")) {
806 $tar_archive = Archive::Tar::Builder->new(
807 ignore_errors => 1
808 );
809
810 # Set handle
811 $tar_archive->set_handle($tar_fh);
812 }
813
814 # Fallback solution
815 else {
816 $tar_archive = KorAP::XML::TarBuilder->new(
817 $tar_fh
818 );
819 };
Akronda3097e2017-04-23 19:53:57 +0200820
821 # Output to temporary directory
822 $output_dir = File::Temp->newdir;
823 };
824
Akron941c1a62016-02-23 17:41:41 +0100825 # Report on fork message
826 $pool->run_on_finish (
827 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200828 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100829 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200830
Akrona3518372024-01-22 23:29:00 +0100831 unless ($q) {
832 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
833 $iter . "/$count]" .
834 ($code ? " $code" : '') .
835 ' ' . $data->[0] . "\n";
836 };
837 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200838
839 if (!$code && $to_tar && $data->[2]) {
840 my $filename = $data->[2];
841
842 # Lock filehandle
843 if (flock($tar_fh, LOCK_EX)) {
844
Akron9a062ce2017-07-04 19:12:05 +0200845 my $clean_file = fileparse($filename);
846
Akronda3097e2017-04-23 19:53:57 +0200847 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200848 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200849 unlink $filename;
850
851 # Unlock filehandle
852 flock($tar_fh, LOCK_UN);
853 }
854 else {
855 $log->warn("Unable to add $filename to archive");
856 };
857 };
858
Akron4c0cf312016-10-15 16:42:09 +0200859 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100860 }
861 );
862
863 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200864 my $temp;
Akrona3518372024-01-22 23:29:00 +0100865 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100866
Akron7d4cdd82016-08-17 21:39:45 +0200867 # unless (Cache::FastMmap->new(
868 # share_file => $cache_file,
869 # cache_size => $cache_size,
870 # init_file => $cache_init
871 # )) {
872 # print "Unable to intialize cache '$cache_file'\n\n";
873 # exit(1);
874 # };
Akron11c80302016-03-18 19:44:43 +0100875
Akron486f9ab2017-04-22 23:25:19 +0200876
Akron941c1a62016-02-23 17:41:41 +0100877 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100878 if (-d $input[0]) {
Akron941c1a62016-02-23 17:41:41 +0100879
Akronce033502024-09-11 10:51:49 +0200880 my @dirs;
881
Akron0a0d1f92024-11-14 14:31:42 +0100882 my $rule = Path::Iterator::Rule->new;
883 $rule->name('data.xml')->file;
884 my $next = $rule->iter(
885 $input[0] => {
886 sorted => 0,
887 depthfirst => -1,
888 error_handler => undef
889 });
890 while (defined(my $file = $next->())) {
891 $file =~ s/\/data\.xml$//;
892 push @dirs, $file;
893 };
Akron941c1a62016-02-23 17:41:41 +0100894
Akrona3518372024-01-22 23:29:00 +0100895 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100896 $t = Benchmark->new;
897 $count = scalar @dirs;
898
899 DIRECTORY_LOOP:
900 for (my $i = 0; $i < $count; $i++) {
901
Akrone1dbc382016-07-08 22:24:52 +0200902 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200903 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200904 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200905 );
Akron941c1a62016-02-23 17:41:41 +0100906
907 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200908 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200909
Akron13d56622016-10-31 14:54:49 +0100910 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200911 $pool->finish(
912 0,
Akronda3097e2017-04-23 19:53:57 +0200913 [
914 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
915 undef,
916 $filename
917 ]
Akron486f9ab2017-04-22 23:25:19 +0200918 );
Akron3ec48972016-08-17 23:24:52 +0200919 }
920 else {
Akron4c0cf312016-10-15 16:42:09 +0200921 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200922 };
Akron941c1a62016-02-23 17:41:41 +0100923 };
924 }
925
926 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200927 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200928
Akron941c1a62016-02-23 17:41:41 +0100929 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200930 $log->error("Unzip is not installed or incompatible.");
931 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100932 };
933
Akron08385f62016-03-22 20:37:04 +0100934 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200935 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100936
Akron31a08cb2019-02-20 20:43:26 +0100937 # Get sigles to extract
938 my $prefix = set_sigle($archive);
939
Akrona3518372024-01-22 23:29:00 +0100940 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100941 $t = Benchmark->new;
942 my @dirs = $archive->list_texts;
943 $count = scalar @dirs;
944
945 ARCHIVE_LOOP:
946 for (my $i = 0; $i < $count; $i++) {
947
948 # Split path information
949 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
950
Akrone1dbc382016-07-08 22:24:52 +0200951 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200952 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200953 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200954 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200955 catfile($corpus, $doc, $text)
956 . '.json' . ($gzip ? '.gz' : '')
957 )
Akrone1dbc382016-07-08 22:24:52 +0200958 );
Akron941c1a62016-02-23 17:41:41 +0100959
960 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200961 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100962
Akron4c0cf312016-10-15 16:42:09 +0200963 # Create temporary file
964 $temp = File::Temp->newdir;
965
Akronbdf434a2016-10-24 17:42:07 +0200966 # TODO: Check if $filename exist at the beginning,
967 # because extraction can be horrible slow!
968
Akron941c1a62016-02-23 17:41:41 +0100969 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100970 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100971
Akron7d4cdd82016-08-17 21:39:45 +0200972 # Create corpus directory
973 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100974
Akron7d4cdd82016-08-17 21:39:45 +0200975 # Temporary directory
976 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100977
Akron7d4cdd82016-08-17 21:39:45 +0200978 # Write file
Akron13d56622016-10-31 14:54:49 +0100979 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200980
Akron4c0cf312016-10-15 16:42:09 +0200981 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100982 $pool->finish(
983 0,
Akronda3097e2017-04-23 19:53:57 +0200984 [
985 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
986 $temp,
987 $filename
988 ]
Akron13d56622016-10-31 14:54:49 +0100989 );
Akron7d4cdd82016-08-17 21:39:45 +0200990 }
991 else {
Akron4c0cf312016-10-15 16:42:09 +0200992 # Delete temporary file
993 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200994 };
Akron941c1a62016-02-23 17:41:41 +0100995 }
Akron7d4cdd82016-08-17 21:39:45 +0200996
997 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100998 else {
Akron4c0cf312016-10-15 16:42:09 +0200999 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001000 };
1001 };
1002 }
1003
1004 else {
Akrona3518372024-01-22 23:29:00 +01001005 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +01001006 };
1007
1008 $pool->wait_all_children;
1009
Akronda3097e2017-04-23 19:53:57 +02001010 # Close tar filehandle
1011 if ($to_tar && $tar_fh) {
1012 $tar_archive->finish;
1013 $tar_fh->close;
Akrona3518372024-01-22 23:29:00 +01001014 print "Wrote to tar archive.\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +02001015 };
Akrona3518372024-01-22 23:29:00 +01001016 unless ($q) {
1017 print timestr(timediff(Benchmark->new, $t))."\n";
1018 print "Done.\n";
1019 };
Akron81500102017-04-07 20:45:44 +02001020};
Akron941c1a62016-02-23 17:41:41 +01001021
Nils Diewald2db9ad02013-10-29 19:26:43 +00001022
# For an archive, this will create the list
# of all sigles to process.
#
# Takes the archive object as its only parameter and works on the
# file-level @sigle list:
# - If no sigles were passed, @sigle is filled with the sigles of
#   all texts listed by the archive.
# - Otherwise all document sigles ("Corpus/Doc") are extracted
#   directly to $output, and @sigle is reduced to the remaining
#   text sigles.
#
# Returns the prefix information of the archive, as reported by
# split_path() or check_prefix() respectively (defaults to 1).
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts
  unless (@sigle) {

    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  foreach my $sigle (@sigle) {

    # A doc sigle has exactly two path segments
    # (optionally preceded by a "." root folder)
    my $is_doc = $sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$sigle ..." if $is_doc && !$q;

    # Check if a prefix is needed (only once per call).
    # The assignment is intentionally separated from the quiet check:
    # "$prefix = $archive->check_prefix && !$q" binds as
    # "$prefix = (check_prefix && !$q)" and therefore reset the
    # prefix to false whenever quiet mode was active.
    unless ($prefix_checked) {
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix && !$q;
      $prefix_checked = 1;
    };

    # Text sigles are kept for later processing
    unless ($is_doc) {
      push @text_sigle, $sigle;
      next;
    };

    # Doc sigles are extracted immediately
    print "\n" unless $q;

    my $extracted = $archive->extract_sigle(
      $q, [$sigle], $output, $sequential_extraction ? 1 : $jobs
    );

    unless ($q) {
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
1101
1102
# Remove the extraction directory, if one was used,
# and log the number of objects that were deleted
if ($extract_dir) {
  my $objects = remove_tree(
    $extract_dir => {
      safe => 1
    }
  );
  $log->info("Removed directory $extract_dir with $objects objects");
};


# Finish the output with a line break
print "\n";
1111
Nils Diewald2db9ad02013-10-29 19:26:43 +00001112__END__
Akron941c1a62016-02-23 17:41:41 +01001113
1114=pod
1115
1116=encoding utf8
1117
1118=head1 NAME
1119
Akron42f48c12020-02-14 13:08:13 +01001120korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001121
1122
1123=head1 SYNOPSIS
1124
Akron9cb8c982024-03-22 10:46:56 +01001125 $ korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001126
Akron2fd402b2016-10-27 21:26:48 +02001127
Akron941c1a62016-02-23 17:41:41 +01001128=head1 DESCRIPTION
1129
1130L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1131compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001132The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001133
1134
1135=head1 INSTALLATION
1136
1137The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1138
Akron9cb8c982024-03-22 10:46:56 +01001139 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001140
Akronc13a1702016-03-15 19:33:14 +01001141In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001142be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001143Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001144Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1145Optional support for L<Sys::Info> to calculate available cores is available.
Akrona93d51b2016-10-24 20:27:48 +02001146In addition to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001147
1148=head1 ARGUMENTS
1149
Akron9cb8c982024-03-22 10:46:56 +01001150 $ korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001151
1152Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001153It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001154
Akron941c1a62016-02-23 17:41:41 +01001155=over 2
1156
1157=item B<archive>
1158
Akron9cb8c982024-03-22 10:46:56 +01001159 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001160
Akron2fd402b2016-10-27 21:26:48 +02001161Converts an archive of KorAP-XML documents. It expects a directory
1162(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001163
1164=item B<extract>
1165
Akron9cb8c982024-03-22 10:46:56 +01001166 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001167
1168Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001169
Akron63f20d42017-04-10 23:40:29 +02001170=item B<serial>
1171
Akron9cb8c982024-03-22 10:46:56 +01001172 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001173
Convert archives in serial. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001179
1180
Akron84b53ad2022-01-14 12:39:15 +01001181=item B<slimlog>
1182
Akron9cb8c982024-03-22 10:46:56 +01001183 $ korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001184
Filters out all useless (i.e. successful) information from logs to simplify
log checks. Expects no further options.
1187
1188
Akron941c1a62016-02-23 17:41:41 +01001189=back
1190
1191
1192=head1 OPTIONS
1193
1194=over 2
1195
Akrona76d8352016-10-27 16:27:32 +02001196=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001197
Akrona76d8352016-10-27 16:27:32 +02001198Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001199
Akron7606afa2016-10-25 16:23:49 +02001200Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001201document, while C<archive> expects a KorAP-XML corpus folder or a zip
1202file to batch process multiple files.
1203C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001204
Akrondee3cf62024-06-14 18:14:48 +02001205C<archive> supports multiple input zip files with the constraint
Akron2cfe8092016-06-24 17:48:49 +02001206that the first archive listed contains all primary data files
1207and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001208
Akron7606afa2016-10-25 16:23:49 +02001209 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001210
Akron821db3d2017-04-06 21:19:31 +02001211Input may also be defined using BSD glob wildcards.
1212
1213 -i 'file/news*.zip'
1214
1215The extended input array will be sorted in length order, so the shortest
1216path needs to contain all primary data files and all meta data files.
1217
Akrondee3cf62024-06-14 18:14:48 +02001218(The directory structure follows the base directory format
Akron0c3e3752016-06-28 15:55:53 +02001219that may include a C<.> root folder.
1220In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001221need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001223
Akron7606afa2016-10-25 16:23:49 +02001224To support zip files, a version of C<unzip> needs to be installed that is
1225compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001226
Akron7606afa2016-10-25 16:23:49 +02001227B<The root folder switch using the hash sign is experimental and
1228may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001229
Akronf73ffb62018-06-27 12:13:59 +02001230
Akron63f20d42017-04-10 23:40:29 +02001231=item B<--input-base|-ib> <directory>
1232
1233The base directory for inputs.
1234
1235
Akron941c1a62016-02-23 17:41:41 +01001236=item B<--output|-o> <directory|file>
1237
1238Output folder for archive processing or
1239document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001240writes to C<STDOUT> by default
1241(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001242
1243=item B<--overwrite|-w>
1244
1245Overwrite files that already exist.
1246
Akronf73ffb62018-06-27 12:13:59 +02001247
Akron3741f8b2016-12-21 19:55:21 +01001248=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001249
1250Define the default tokenization by specifying
1251the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001252of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001253This will directly take the file instead of running
1254the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001255
Akron3741f8b2016-12-21 19:55:21 +01001256
1257=item B<--base-sentences|-bs> <foundry>#<layer>
1258
1259Define the layer for base sentences.
1260If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001261Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1262layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001263
1264 Defaults to unset.
1265
1266
1267=item B<--base-paragraphs|-bp> <foundry>#<layer>
1268
1269Define the layer for base paragraphs.
1270If given, this will be used instead of using C<Base#Paragraphs>.
Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001273
1274 Defaults to unset.
1275
1276
Akron41ac10b2017-02-08 22:47:25 +01001277=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1278
1279Define the layer for base pagebreaks.
1280Currently C<DeReKo#Structure> is the only layer supported.
1281
1282 Defaults to unset.
1283
1284
Akron941c1a62016-02-23 17:41:41 +01001285=item B<--skip|-s> <foundry>[#<layer>]
1286
Akronf7ad89e2016-03-16 18:22:47 +01001287Skip specific annotations by specifying the foundry
1288(and optionally the layer with a C<#>-prefix),
1289e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001290Can be set multiple times.
1291
Akronf73ffb62018-06-27 12:13:59 +02001292
Akronc13a1702016-03-15 19:33:14 +01001293=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001294
Akronf7ad89e2016-03-16 18:22:47 +01001295Convert specific annotations by specifying the foundry
1296(and optionally the layer with a C<#>-prefix),
1297e.g. C<Mate> or C<Mate#Morpho>.
1298Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001299
Akronf73ffb62018-06-27 12:13:59 +02001300
Akroned9baf02019-01-22 17:03:25 +01001301=item B<--non-word-tokens|-nwt>
1302
1303Tokenize non-word tokens like word tokens (defined as matching
1304C</[\d\w]/>). Useful to treat punctuations as tokens.
1305
1306 Defaults to unset.
1307
Akronf1849aa2019-12-16 23:35:33 +01001308
1309=item B<--non-verbal-tokens|-nvt>
1310
Tokenize non-verbal tokens, marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' aka U+25AE.
1313
1314 Defaults to unset.
1315
1316
Akron941c1a62016-02-23 17:41:41 +01001317=item B<--jobs|-j>
1318
Akron29128262024-04-17 15:50:36 +02001319Define the number of spawned forks for concurrent jobs
1320of archive processing.
Akron11c80302016-03-18 19:44:43 +01001321Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001322
Akrona472a242023-02-13 13:46:30 +01001323If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001324also apply to extraction.
1325
Akronebbac2e2024-03-22 10:31:23 +01001326Pass C<-1>, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001327times the number of available cores, in case L<Sys::Info>
Akronebbac2e2024-03-22 10:31:23 +01001328is available and can read CPU count (see C<--job-count>).
1329Be aware, that the report of available cores
Akron29128262024-04-17 15:50:36 +02001330may not work in certain conditions. Benchmarking the processing
1331speed based on the number of jobs may be valuable.
Akronebbac2e2024-03-22 10:31:23 +01001332
Akronf7ad89e2016-03-16 18:22:47 +01001333This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001334
Akronf73ffb62018-06-27 12:13:59 +02001335
Akronebbac2e2024-03-22 10:31:23 +01001336=item B<--job-count|-jc>
1337
1338Print job and core information that would be used if
1339C<-1> was passed to C<--jobs>.
1340
1341
Akron263274c2019-02-07 09:48:30 +01001342=item B<--koral|-k>
1343
1344Version of the output format. Supported versions are:
1345C<0> for legacy serialization, C<0.03> for serialization
1346with metadata fields as key-values on the root object,
1347C<0.4> for serialization with metadata fields as a list
1348of C<"@type":"koral:field"> objects.
1349
1350Currently defaults to C<0.03>.
1351
1352
Akron9ec88872017-04-12 16:29:06 +02001353=item B<--sequential-extraction|-se>
1354
Flag to indicate that extraction runs sequentially, i.e. that the C<jobs> value does not apply to extraction.
1356Some systems may have problems with extracting multiple archives
1357to the same folder at the same time.
1358Can be flagged using C<--no-sequential-extraction> as well.
1359Defaults to C<false>.
1360
Akronf73ffb62018-06-27 12:13:59 +02001361
Akron35db6e32016-03-17 22:42:22 +01001362=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001363
Akron35db6e32016-03-17 22:42:22 +01001364Define the metadata parser to use. Defaults to C<I5>.
1365Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1366This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001367
Akronf73ffb62018-06-27 12:13:59 +02001368
Akron941c1a62016-02-23 17:41:41 +01001369=item B<--gzip|-z>
1370
Akronf7ad89e2016-03-16 18:22:47 +01001371Compress the output.
1372Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001373
Akronf73ffb62018-06-27 12:13:59 +02001374
Akron11c80302016-03-18 19:44:43 +01001375=item B<--cache|-c>
1376
1377File to mmap a cache (using L<Cache::FastMmap>).
1378Defaults to C<korapxml2krill.cache> in the calling directory.
1379
Akronf73ffb62018-06-27 12:13:59 +02001380
Akron11c80302016-03-18 19:44:43 +01001381=item B<--cache-size|-cs>
1382
1383Size of the cache. Defaults to C<50m>.
1384
Akronf73ffb62018-06-27 12:13:59 +02001385
Akron11c80302016-03-18 19:44:43 +01001386=item B<--cache-init|-ci>
1387
1388Initialize cache file.
1389Can be flagged using C<--no-cache-init> as well.
1390Defaults to C<true>.
1391
Akronf73ffb62018-06-27 12:13:59 +02001392
Akron11c80302016-03-18 19:44:43 +01001393=item B<--cache-delete|-cd>
1394
1395Delete cache file after processing.
1396Can be flagged using C<--no-cache-delete> as well.
1397Defaults to C<true>.
1398
Akronf73ffb62018-06-27 12:13:59 +02001399
Akron636aa112017-04-07 18:48:56 +02001400=item B<--config|-cfg>
1401
1402Configure the parameters of your call in a file
1403of key-value pairs with whitespace separator
1404
1405 overwrite 1
1406 token DeReKo#Structure
1407 ...
1408
1409Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001410C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akronc0ac4ff2024-04-15 18:03:15 +02001411C<token>, C<log>,
1412C<cache>, C<cache-size>, C<cache-init>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001413C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001414C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001415C<base-sentences>, C<base-paragraphs>,
1416C<base-pagebreaks>,
1417C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001418(semicolon separated), C<anno> (semicolon separated).
1419
Akronf73ffb62018-06-27 12:13:59 +02001420Configuration parameters will always be overwritten by
1421passed parameters.
1422
1423
Akron81500102017-04-07 20:45:44 +02001424=item B<--temporary-extract|-te>
1425
Akrona472a242023-02-13 13:46:30 +01001426Only valid for the C<archive> and C<serial>
1427commands.
Akron81500102017-04-07 20:45:44 +02001428
1429This will first extract all files into a
1430directory and then will archive.
1431If the directory is given as C<:temp:>,
1432a temporary directory is used.
1433This is especially useful to avoid
1434massive unzipping and potential
1435network latency.
Akron636aa112017-04-07 18:48:56 +02001436
Akronf73ffb62018-06-27 12:13:59 +02001437
Akronc93a0802019-07-11 15:48:34 +02001438=item B<--to-tar>
1439
1440Only valid for the C<archive> command.
1441
1442Writes the output into a tar archive.
1443
1444
Akrone10ad322016-02-27 10:54:26 +01001445=item B<--sigle|-sg>
1446
Akron20807582016-10-26 17:11:34 +02001447Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001448Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001449I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001450Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001451In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001452On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001453
Akron64f7fae2022-07-27 12:45:33 +02001454=item B<--lang>
1455
1456Preferred language for metadata fields. In case multiple titles are
1457given (on any level) with different C<xml:lang> attributes,
1458the language given is preferred.
1459Because titles may have different sources and different priorities,
1460non-specific language titles may still be preferred in case the title
1461source has a higher priority.
1462
Akronf73ffb62018-06-27 12:13:59 +02001463
Akron941c1a62016-02-23 17:41:41 +01001464=item B<--log|-l>
1465
Akronb9c33812020-10-21 16:19:35 +02001466The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001467
Akronf73ffb62018-06-27 12:13:59 +02001468
Akrona3518372024-01-22 23:29:00 +01001469=item B<--quiet>
1470
1471Silence all information (non-log) outputs.
1472
1473
Akron941c1a62016-02-23 17:41:41 +01001474=item B<--help|-h>
1475
Akron42f48c12020-02-14 13:08:13 +01001476Print help information.
Akron941c1a62016-02-23 17:41:41 +01001477
Akronf73ffb62018-06-27 12:13:59 +02001478
Akron941c1a62016-02-23 17:41:41 +01001479=item B<--version|-v>
1480
1481Print version information.
1482
1483=back
1484
Akron311e29b2024-09-11 11:46:09 +02001485=head1 PERFORMANCE
1486
1487There are some ways to improve performance for large tasks:
1488
=over 2

=item First unpack

Using the archive or serial command on one or multiple zip files
can be very slow, as it needs to unpack small portions every time.
It's better to use C<--temporary-extract> to unpack the whole archive
first into a temporary directory and then read the extracted files.
This is especially important for remote archives.

=item Limit annotations

Per default, all supported annotation layers are sought. This can be limited
by adding C<--skip '#ALL'> and only listing the expected annotations with C<--anno>.

=item Checking the parallel job count

By providing the number of parallel jobs using C<--jobs>, the execution can be tailored to specific
hardware environments.

=back
1506
Akronc13a1702016-03-15 19:33:14 +01001507=head1 ANNOTATION SUPPORT
1508
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1510developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1511The base foundry with paragraphs, sentences, and the text element are mandatory for
1512L<Krill|https://github.com/KorAP/Krill>.
1513
Akron821db3d2017-04-06 21:19:31 +02001514 Base
1515 #Paragraphs
1516 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001517
Akron821db3d2017-04-06 21:19:31 +02001518 Connexor
1519 #Morpho
1520 #Phrase
1521 #Sentences
1522 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001523
Akron821db3d2017-04-06 21:19:31 +02001524 CoreNLP
1525 #Constituency
1526 #Morpho
1527 #NamedEntities
1528 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001529
Akron5530a552022-02-17 17:53:15 +01001530 CorpusExplorer
1531 #Morpho
1532
Akronce125b62017-06-19 11:54:36 +02001533 CMC
1534 #Morpho
1535
Akron821db3d2017-04-06 21:19:31 +02001536 DeReKo
1537 #Structure
Akronc13a1702016-03-15 19:33:14 +01001538
Akron57510c12019-01-04 14:58:53 +01001539 DGD
1540 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001541 #Structure
Akron57510c12019-01-04 14:58:53 +01001542
Akron821db3d2017-04-06 21:19:31 +02001543 DRuKoLa
1544 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001545
Akron821db3d2017-04-06 21:19:31 +02001546 Glemm
1547 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001548
Akronabb36902021-10-11 15:51:06 +02001549 Gingko
1550 #Morpho
1551
Akronea1aed52018-07-19 14:43:34 +02001552 HNC
1553 #Morpho
1554
Akron4c679192018-01-16 17:41:49 +01001555 LWC
1556 #Dependency
1557
Akron821db3d2017-04-06 21:19:31 +02001558 Malt
1559 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001560
Akron821db3d2017-04-06 21:19:31 +02001561 MarMoT
1562 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001563
Akron821db3d2017-04-06 21:19:31 +02001564 Mate
1565 #Dependency
1566 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001567
Akron821db3d2017-04-06 21:19:31 +02001568 MDParser
1569 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001570
Akrone85a7762022-07-22 08:05:03 +02001571 NKJP
1572 #Morpho
1573 #NamedEntities
1574
Akron821db3d2017-04-06 21:19:31 +02001575 OpenNLP
1576 #Morpho
1577 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001578
Akron07e24772020-04-23 14:00:54 +02001579 RWK
1580 #Morpho
1581 #Structure
1582
Akron821db3d2017-04-06 21:19:31 +02001583 Sgbr
1584 #Lemma
1585 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001586
Marc Kupietzb8c53822024-03-16 18:54:08 +01001587 Spacy
1588 #Morpho
1589
Akron7d5e6382019-08-08 16:36:27 +02001590 Talismane
1591 #Dependency
1592 #Morpho
1593
Akron821db3d2017-04-06 21:19:31 +02001594 TreeTagger
1595 #Morpho
1596 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001597
Akron83aedd32023-02-07 10:57:41 +01001598 UDPipe
1599 #Dependency
1600 #Morpho
1601
Akron821db3d2017-04-06 21:19:31 +02001602 XIP
1603 #Constituency
1604 #Morpho
1605 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001606
Akronc13a1702016-03-15 19:33:14 +01001607
1608More importers are in preparation.
1609New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1610See the built-in annotation importers as examples.
1611
Akronf73ffb62018-06-27 12:13:59 +02001612
Akron41e6c8b2021-10-14 20:22:18 +02001613=head1 METADATA SUPPORT
1614
1615L<KorAP::XML::Krill> has built-in importers for some meta data variants
Akron4b001ce2024-06-06 12:32:11 +02001616that are part of the KorAP preprocessing pipeline.
Akron41e6c8b2021-10-14 20:22:18 +02001617
1618=over 2
1619
Akron1d101492024-06-06 12:47:35 +02001620=item B<I5>
Akron41e6c8b2021-10-14 20:22:18 +02001621
Akron1d101492024-06-06 12:47:35 +02001622Meta data for all I5 files
Akron41e6c8b2021-10-14 20:22:18 +02001623
Akron1d101492024-06-06 12:47:35 +02001624=item B<Sgbr>
Akron41e6c8b2021-10-14 20:22:18 +02001625
Akron1d101492024-06-06 12:47:35 +02001626Meta data from the Schreibgebrauch project
Akron2532f1b2023-05-15 13:41:24 +02001627
Akron1d101492024-06-06 12:47:35 +02001628=item B<Gingko>
1629
1630Meta data from the Gingko project in addition to I5
1631
1632=item B<ICC>
1633
1634Meta data for the ICC in addition to I5
1635
1636=item B<NKJP>
1637
1638Meta data for the NKJP corpora
Akron24ad3c02024-06-03 12:38:20 +02001639
Akron41e6c8b2021-10-14 20:22:18 +02001640=back
1641
Akron41e6c8b2021-10-14 20:22:18 +02001642New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1643See the built-in meta data importers as examples.
1644
Akron4b001ce2024-06-06 12:32:11 +02001645The I5 metadata definition is based on TEI-P5 and supports C<E<lt>xenoDataE<gt>>
Akron82064bb2024-06-17 12:53:23 +02001646with C<E<lt>metaE<gt>> elements like
Akron4b001ce2024-06-06 12:32:11 +02001647
1648 <meta type="..." name="..." project="..." desc="...">...</meta>
1649
1650that are directly translated to Krill objects. The supported values are:
1651
1652=over 2
1653
Akron1d101492024-06-06 12:47:35 +02001654=item C<type>
Akron4b001ce2024-06-06 12:32:11 +02001655
1656=over 4
1657
Akron1d101492024-06-06 12:47:35 +02001658=item C<string>
Akron4b001ce2024-06-06 12:32:11 +02001659
Akron1d101492024-06-06 12:47:35 +02001660String meta data value
Akron4b001ce2024-06-06 12:32:11 +02001661
Akron1d101492024-06-06 12:47:35 +02001662=item C<keyword>
Akron4b001ce2024-06-06 12:32:11 +02001663
Akrondee3cf62024-06-14 18:14:48 +02001664String meta data value that can be given multiple times
Akron4b001ce2024-06-06 12:32:11 +02001665
Akron1d101492024-06-06 12:47:35 +02001666=item C<text>
Akron4b001ce2024-06-06 12:32:11 +02001667
Akrondee3cf62024-06-14 18:14:48 +02001668String meta data value that is tokenized and can be searched as token sequences
Akron4b001ce2024-06-06 12:32:11 +02001669
Akron1d101492024-06-06 12:47:35 +02001670=item C<date>
1671
1672Date meta data value (as "yyyy/mm/dd" with optional granularity)
1673
1674=item C<integer>
1675
1676Numerical meta data value
1677
Akrondee3cf62024-06-14 18:14:48 +02001678=item C<attachment>
Akron1d101492024-06-06 12:47:35 +02001679
1680Non-indexed meta data value (only retrievable)
1681
1682=item C<uri>
1683
1684Non-indexed attached URI, takes the desc as the title for links
Akron4b001ce2024-06-06 12:32:11 +02001685
1686=back
1687
Akron1d101492024-06-06 12:47:35 +02001688=item C<name>
Akron4b001ce2024-06-06 12:32:11 +02001689
Akrondee3cf62024-06-14 18:14:48 +02001690The key of the meta object that may be prefixed by C<corpus> or C<doc>, in case the
Akron693f5882024-06-06 12:52:39 +02001691C<E<lt>xenoDataE<gt>> information is located on these levels. The text level introduces
1692no prefixes.
Akron4b001ce2024-06-06 12:32:11 +02001693
Akron1d101492024-06-06 12:47:35 +02001694=item C<project> (optional)
Akron4b001ce2024-06-06 12:32:11 +02001695
Akron1d101492024-06-06 12:47:35 +02001696A prefixed namespace of the key
1697
1698=item C<desc> (optional)
1699
1700A description of the key
1701
1702=item text content
1703
1704The value of the meta object
Akron4b001ce2024-06-06 12:32:11 +02001705
1706=back
1707
Akron41e6c8b2021-10-14 20:22:18 +02001708
Akron8f69d632020-01-15 16:58:11 +01001709=head1 About KorAP-XML
1710
1711KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1712data model (Bański et al. 2013), where text data are stored physically
1713separated from their interpretations (i.e. annotations).
1714A text document in KorAP-XML therefore consists of several files
1715containing primary data, metadata and annotations.
1716
1717The structure of a single KorAP-XML document can be as follows:
1718
1719 - data.xml
1720 - header.xml
1721 + base
1722 - tokens.xml
1723 - ...
1724 + struct
1725 - structure.xml
1726 - ...
1727 + corenlp
1728 - morpho.xml
1729 - constituency.xml
1730 - ...
1731 + tree_tagger
1732 - morpho.xml
1733 - ...
1734 - ...
1735
1736The C<data.xml> contains the primary data, the C<header.xml> contains
1737the metadata, and the annotation layers are stored in subfolders
1738like C<base>, C<struct> or C<corenlp>
1739(so-called "foundries"; Bański et al. 2013).
1740
1741Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001742(Lüngen and Sperberg-McQueen 2012). See the documentation in
1743L<KorAP::XML::Meta::I5> for translatable fields.
1744
1745Annotations correspond to a variant of the TEI-P5 feature structures
1746(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001747Annotation feature structures refer to character sequences of the primary text
1748inside the C<text> element of the C<data.xml>.
1749A single annotation containing the lemma of a token can have the following structure:
1750
1751 <span from="0" to="3">
1752 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1753 <f name="lex">
1754 <fs>
1755 <f name="lemma">zum</f>
1756 </fs>
1757 </f>
1758 </fs>
1759 </span>
1760
1761The C<from> and C<to> attributes refer to the character span
1762in the primary text.
1763Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1764the structure may vary. See C<KorAP::XML::Annotation::*> for various
1765annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001766
1767Multiple KorAP-XML documents are organized on three levels following
1768the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1769corpus E<gt> document E<gt> text. On each level metadata information
1770can be stored, which C<korapxml2krill> will merge into a single metadata
1771object per text. A corpus is therefore structured as follows:
1772
1773 + <corpus>
1774 - header.xml
1775 + <document>
1776 - header.xml
1777 + <text>
1778 - data.xml
1779 - header.xml
1780 - ...
1781 - ...
1782
1783A single text can be identified by the concatenation of
1784the corpus identifier, the document identifier and the text identifier.
1785This identifier is called the text sigle
1786(e.g. a text with the identifier C<18486> in the document C<060> in the
1787corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1788
1789These corpora are often stored in zip files, which C<korapxml2krill>
1790can deal with. Corpora may also be split into multiple zip archives
1791(e.g. one zip file per foundry), which is also supported (see C<--input>).
1792
1793Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1794in the form of a test suite.
1795The resulting JSON format merges all annotation layers
1796based on a single token stream.
1797
1798=head2 References
1799
1800Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1801KorAP data model: first approximation, December.
1802
1803Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1804"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1805Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1806L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1807
1808Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1809"Robust corpus architecture: a new look at virtual collections and data access",
1810Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1811L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1812
1813Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1814Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1815"Towards an international standard on feature structure representation",
1816Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1817pp. 373-376.
1818L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1819
1820Harald Lüngen and C. M. Sperberg-McQueen (2012):
1821"A TEI P5 Document Grammar for the IDS Text Model",
1822Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1823L<PDF|https://journals.openedition.org/jtei/pdf/508>
1824
1825TEI Consortium, eds:
1826"Feature Structures",
1827Guidelines for Electronic Text Encoding and Interchange.
1828L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1829
Akron941c1a62016-02-23 17:41:41 +01001830=head1 AVAILABILITY
1831
1832 https://github.com/KorAP/KorAP-XML-Krill
1833
1834
1835=head1 COPYRIGHT AND LICENSE
1836
Akrona3518372024-01-22 23:29:00 +01001837Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001838
Akron6882d7d2021-02-08 09:43:57 +01001839Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001840
Akron29128262024-04-17 15:50:36 +02001841Contributors: Eliza Margaretha, Marc Kupietz
Akron941c1a62016-02-23 17:41:41 +01001842
Akron6882d7d2021-02-08 09:43:57 +01001843L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001844Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001845L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001846member of the
Akronf1849aa2019-12-16 23:35:33 +01001847L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001848
1849This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001850L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001851
1852=cut