blob: 132e8c6863522ea01d8e2db3c5c16811d887a161 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Marc Kupietzb8c53822024-03-16 18:54:08 +0100174# 2024/03/20
175# - Added Spacy support.
176#
Akronebbac2e2024-03-22 10:31:23 +0100177# 2024/03/22
178# - Improve core count logging.
Akron941c1a62016-02-23 17:41:41 +0100179# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100180
# Release metadata of the script
our $LAST_CHANGE   = '2024/06/04';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# Banner used as the message of the usage output:
# "Version <module version> - <contact> - <last change>"
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
187
# Parse command: a leading argument that does not start with
# a dash is taken as the command and removed from @ARGV
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Copy of the remaining arguments, taken before the options are parsed
my @keep_argv = @ARGV;

# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100200
# Count jobs/cores if not set.
#
# Returns a list of two elements: the suggested number of jobs
# (five per detected core) and a message describing that choice.
# If Sys::Info can't be loaded, the query fails or no positive
# core count is reported, an error is logged and one core is assumed.
sub count_jobs {
  my $cores;

  # Sys::Info is optional and therefore loaded at runtime.
  # The explicit import calls do what "use" would do.
  my $loaded = eval {
    require Sys::Info;
    Sys::Info->import;
    require Sys::Info::Constants;
    Sys::Info::Constants->import(':device_cpu');
    1;
  };

  # Guard the query as well, so a failing probe can't abort the script
  $cores = eval { Sys::Info->new->device('CPU')->count } if $loaded;

  unless (defined $cores && $cores > 0) {
    $log->error('Unable to determine number of cores - set to 1');
    $cores = 1;
  };

  my $jobs = ceil(5 * $cores);
  return $jobs, "Run using $jobs jobs on $cores cores";
}
220
# Parse options from the command line

# Path of an optional configuration file (--config)
my $cfg_file;

# Shared pod2usage call of the informational flags.
# Takes the message and further pod2usage arguments.
my $show_usage = sub {
  my ($msg, @extra) = @_;
  pod2usage(-msg => $msg, -output => '-', @extra);
};

GetOptions(

  # Values that can be given multiple times
  'input|i=s'  => \@input,
  'skip|s=s'   => \@skip,
  'sigle|sg=s' => \@sigle,
  'anno|a=s'   => \@anno,

  # Values stored in the configuration hash
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{temporary_extract},
  'cache|c=s'                => \$cfg{cache},
  'lang=s'                   => \$cfg{lang},
  'log|l=s'                  => \$cfg{log},
  'quiet'                    => \$cfg{quiet},
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Configuration file
  'config|cfg=s' => \$cfg_file,

  # Flags that are still accepted but have no effect anymore
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Informational flags
  'help|h' => sub {
    $show_usage->(
      $VERSION_MSG,
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99
    );
  },
  'version|v' => sub {
    $show_usage->($VERSION_MSG, -verbose => 0);
  },
  'job-count|jc' => sub {
    my $msg = (count_jobs())[1];
    $show_usage->($msg, -verbose => 0);
  }
);
281
# Arguments passed to pod2usage whenever the invocation is invalid:
# the usage sections are requested along with an exit status of 1
my %ERROR_HASH = (
  '-exit'     => 1,
  '-output'   => '-',
  '-msg'      => $VERSION_MSG,
  '-verbose'  => 99,
  '-sections' => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);
Akron63f20d42017-04-10 23:40:29 +0200289
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single values: a value already set on the command line wins.
  # Dashes in configuration keys become underscores in %cfg.
  my @single_keys = qw!
    output cache-size input-base token overwrite
    meta base-sentences base-paragraphs base-pagebreaks
    gzip to-tar log lang cache non-word-tokens
    non-verbal-tokens sequential-extraction
    temporary-extract cache-init cache-delete
    koral extract-dir jobs quiet
  !;

  for my $key (@single_keys) {
    next unless defined $config{$key};
    (my $underlined = $key) =~ tr/-/_/;
    $cfg{$underlined} //= $config{$key};
  };

  # Multiple values (skip, sigle, anno) are separated by semicolons
  # and only taken, if nothing was given on the command line
  for my $multi ([skip => \@skip], [sigle => \@sigle], [anno => \@anno]) {
    my ($key, $target) = @$multi;
    next if scalar(@$target) || !defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
325
# Init variables and set default values
my $output                = $cfg{output};
my $input_base            = $cfg{input_base};
my $gzip                  = $cfg{gzip};
my $to_tar                = $cfg{to_tar};
my $extract_dir           = $cfg{temporary_extract};
my $token_base            = $cfg{token} // 'OpenNLP#tokens';
my $cache_file            = $cfg{cache} // 'korapxml2krill.cache';
my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Quiet flag as a boolean (double negation never yields undef,
# so no default is required)
my $q = !!$cfg{quiet};

# Get tokenization basis, given as "Foundry#Layer".
# Declaration and conditional assignment are kept apart, as the
# behaviour of "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension - the layer is undefined,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of everything to skip (lowercased)
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR using the requested level
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);
Akron63f20d42017-04-10 23:40:29 +0200357
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the next argument after the command
  my $log_file = shift @ARGV;

  # Nothing to slim without a log file
  unless (-e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  KorAP::XML::Log::Slim->new($log_file)->slim_to;

  exit;
};
379
380
# When running a command with an output that is not meant to be
# tarred, the output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined and gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000391
# Start serial processing:
# Run the archive command once per input in a separate process.
# The script exits with 1, if at least one of the runs failed.
if ($cmd && $cmd eq 'serial') {

  # Remove all inputs and outputs from the kept arguments,
  # as each run gets its own input and output appended
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag followed by a separate value
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    }

    # Input or output flag with an attached value, e.g. --input=x
    # (does not match --input-base)
    elsif (/^--?(?:input|i|output|o)=/) {
      $keep = 0;
    };

    # Pass everything else
    $keep;
  } @keep_argv;

  # Number of failed archive runs
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    unless ($q) {
      print "Start serial processing of $_ to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving - a non-zero return value means the run
    # could not be started or did not finish successfully
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $_ failed (status $?)");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
444
# Define supported (and preinstalled) transformation modules.
# Each entry is an array of foundry and layer, optionally followed
# by a third element (e.g. 'base-sentences-paragraphs').
# The order of the entries is kept as it was.
my @layers = ();

# Add one entry per given layer for a foundry
my $add_layers = sub {
  my ($foundry, @names) = @_;
  push @layers, map { [$foundry, $_] } @names;
};

# Base - only used, if no other base is requested
$add_layers->('Base', 'Sentences')  unless $base_sentences;
$add_layers->('Base', 'Paragraphs') unless $base_paragraphs;

$add_layers->('Connexor', qw/Morpho Syntax Phrase Sentences/);
$add_layers->('CoreNLP',  qw/NamedEntities Sentences Morpho Constituency/);
$add_layers->('CMC',      qw/Morpho/);

# DeReKo - collect everything the structure is the base of
my %dereko_base_of = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $dereko_base_of{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
$add_layers->('DGD', qw/Morpho/);
push(@layers, ['DGD', 'Structure', 'base-sentence'])
  if $base_sentences eq 'dgd#structure';

$add_layers->('DRuKoLa',  qw/Morpho/);
$add_layers->('Gingko',   qw/Morpho/);
$add_layers->('Glemm',    qw/Morpho/);
$add_layers->('HNC',      qw/Morpho/);
$add_layers->('LWC',      qw/Dependency/);
$add_layers->('Malt',     qw/Dependency/);
$add_layers->('MarMoT',   qw/Morpho/);
$add_layers->('Mate',     qw/Morpho Dependency/);
$add_layers->('MDParser', qw/Dependency/);
$add_layers->('NKJP',     qw/Morpho NamedEntities/);
$add_layers->('OpenNLP',  qw/Morpho Sentences/);

# Redewiedergabe
$add_layers->('RWK', qw/Morpho/);
$add_layers->('RWK', qw/Structure/) if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
$add_layers->('Sgbr', qw/Lemma Morpho/);

$add_layers->('Spacy',      qw/Morpho/);
$add_layers->('Talismane',  qw/Dependency Morpho/);
$add_layers->('TreeTagger', qw/Morpho Sentences/);
$add_layers->('UDPipe',     qw/Morpho Dependency/);
$add_layers->('XIP',        qw/Morpho Constituency Sentences Dependency/);
Akrone1dbc382016-07-08 22:24:52 +0200575
Akron4fa37c32017-01-20 14:43:10 +0100576
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations given explicitly,
# split into their parts at '#'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Keep unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
595
Akrone1dbc382016-07-08 22:24:52 +0200596
# TODO: This should not be initialized for batch
my %cache_param = (
  share_file     => $cache_file,
  cache_size     => ($cfg{cache_size} // '50m'),
  init_file      => ($cfg{cache_init} // 1),
  unlink_on_exit => $cache_delete
);
my $cache = Cache::FastMmap->new(%cache_param);

# Parameters of the batch object - the token base and the
# filtered annotations along with the remaining configuration
my %batch_param = (
  cache             => $cache,
  log               => $log,
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  anno              => \@filtered_anno,
  gzip              => $gzip,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  lang              => $cfg{lang},
  koral             => ($cfg{koral} // $KORAL_VERSION),
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(%batch_param);
620
# Auto adjust jobs: -1 requests the job count from count_jobs()
if ($jobs eq '-1') {
  my $job_msg;
  ($jobs, $job_msg) = count_jobs();
  print "$job_msg\n" unless $q;
};
626
# Glob and prefix files
if (@input > 0) {

  # Keep the inputs as given for error reporting
  my @patterns = @input;

  # Prefix each input with the input root (if given)
  # and expand wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @patterns;

  # Wildcards without any match expand to nothing. Stop here,
  # as all further processing relies on a defined first input.
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @patterns));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n" unless $q;
};
646
# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # Both timestamps are taken at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since $main::TIME
  # was set, then remember the current time as the last stop
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Append a slash to the input, unless it already ends with one
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  stop_time;

  exit;
};
676
Nils Diewald59094f22014-11-05 18:20:50 +0000677
Akrone10ad322016-02-27 10:54:26 +0100678# Extract XML files
Akron81500102017-04-07 20:45:44 +0200679if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100680
Akrond5643ad2017-07-04 20:27:13 +0200681 # Output is required
682 pod2usage(%ERROR_HASH) unless $output;
683
Akron7d4cdd82016-08-17 21:39:45 +0200684 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200685 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100686
Akron7d4cdd82016-08-17 21:39:45 +0200687 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100688 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200689 $log->error("Unzip is not installed or incompatible.");
690 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100691 };
692
Akronb0c88db2016-06-29 16:33:18 +0200693 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200694 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200695
Akron31a08cb2019-02-20 20:43:26 +0100696 # Will set @sigle
697 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200698
Akrone10ad322016-02-27 10:54:26 +0100699 # Iterate over all given sigles and extract
700 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100701
Akrona3518372024-01-22 23:29:00 +0100702 unless ($q) {
703 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200704
Akrona3518372024-01-22 23:29:00 +0100705 # TODO: Make this OS independent
706 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100707
Akrona3518372024-01-22 23:29:00 +0100708 # TODO:
709 # - prefix???
710 $archive->extract_sigle(0, [$_], $output, $jobs)
711 ? '' : 'not '
712 );
713 print "extracted.\n";
714 } else {
715 $archive->extract_sigle(1, [$_], $output, $jobs);
716 }
Akrone10ad322016-02-27 10:54:26 +0100717 };
Akronb0c88db2016-06-29 16:33:18 +0200718 }
Akron7d4cdd82016-08-17 21:39:45 +0200719
720 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200721 else {
722 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200723 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100724 };
725}
726
Akron81500102017-04-07 20:45:44 +0200727
Akron941c1a62016-02-23 17:41:41 +0100728# Process an archive
729elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000730
Akron81500102017-04-07 20:45:44 +0200731 my $archive_output;
732
733 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100734 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200735
736 # Create new archive object
737 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
738
739 # Check zip capabilities
740 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200741 $log->error("Unzip is not installed or incompatible.");
742 exit 1;
Akron81500102017-04-07 20:45:44 +0200743 };
744
745 # Add further annotation archived
746 $archive->attach($_) foreach @input[1..$#input];
747
748 # Create a temporary directory
749 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200750 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100751 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200752 };
753
Akron63f20d42017-04-10 23:40:29 +0200754 # Add some random extra to avoid clashes with multiple archives
755 $extract_dir = catdir($extract_dir, random_string('cccccc'));
756
Akron31a08cb2019-02-20 20:43:26 +0100757 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100758 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
759 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200760 @input = ($extract_dir);
761 }
762 else {
763 $log->error('Unable to extract from primary archive ' . $input[0] .
764 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200765 exit 1;
Akron81500102017-04-07 20:45:44 +0200766 };
767 }
768
769 # Can't create archive object
770 else {
771 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200772 exit 1;
Akron81500102017-04-07 20:45:44 +0200773 };
774 };
775
Akron7d4cdd82016-08-17 21:39:45 +0200776 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100777 my $pool = Parallel::ForkManager->new($jobs);
778
Akron7d4cdd82016-08-17 21:39:45 +0200779 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100780 my $iter = 1; # Current text in process
781
Akronda3097e2017-04-23 19:53:57 +0200782 my $tar_archive;
783 my $output_dir = $output;
784 my $tar_fh;
785
786 # Initialize tar archive
787 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200788
789 # Set output name
790 my $tar_file = $output;
791 unless ($tar_file =~ /\.tar$/) {
792 $tar_file .= '.tar';
793 };
794
795 # Initiate the tar file
Akrona3518372024-01-22 23:29:00 +0100796 print "Writing to file $tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200797 $tar_fh = IO::File->new($tar_file, 'w');
798 $tar_fh->binmode(1);
799
Akroneb370a02022-02-24 13:33:40 +0100800 # Use tar builder for archiving
801 if (eval("use Archive::Tar::Builder; 1;")) {
802 $tar_archive = Archive::Tar::Builder->new(
803 ignore_errors => 1
804 );
805
806 # Set handle
807 $tar_archive->set_handle($tar_fh);
808 }
809
810 # Fallback solution
811 else {
812 $tar_archive = KorAP::XML::TarBuilder->new(
813 $tar_fh
814 );
815 };
Akronda3097e2017-04-23 19:53:57 +0200816
817 # Output to temporary directory
818 $output_dir = File::Temp->newdir;
819 };
820
Akron941c1a62016-02-23 17:41:41 +0100821 # Report on fork message
822 $pool->run_on_finish (
823 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200824 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100825 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200826
Akrona3518372024-01-22 23:29:00 +0100827 unless ($q) {
828 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
829 $iter . "/$count]" .
830 ($code ? " $code" : '') .
831 ' ' . $data->[0] . "\n";
832 };
833 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200834
835 if (!$code && $to_tar && $data->[2]) {
836 my $filename = $data->[2];
837
838 # Lock filehandle
839 if (flock($tar_fh, LOCK_EX)) {
840
Akron9a062ce2017-07-04 19:12:05 +0200841 my $clean_file = fileparse($filename);
842
Akronda3097e2017-04-23 19:53:57 +0200843 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200844 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200845 unlink $filename;
846
847 # Unlock filehandle
848 flock($tar_fh, LOCK_UN);
849 }
850 else {
851 $log->warn("Unable to add $filename to archive");
852 };
853 };
854
Akron4c0cf312016-10-15 16:42:09 +0200855 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100856 }
857 );
858
859 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200860 my $temp;
Akrona3518372024-01-22 23:29:00 +0100861 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100862
Akron7d4cdd82016-08-17 21:39:45 +0200863 # unless (Cache::FastMmap->new(
864 # share_file => $cache_file,
865 # cache_size => $cache_size,
866 # init_file => $cache_init
867 # )) {
868 # print "Unable to intialize cache '$cache_file'\n\n";
869 # exit(1);
870 # };
Akron11c80302016-03-18 19:44:43 +0100871
Akron486f9ab2017-04-22 23:25:19 +0200872
Akron941c1a62016-02-23 17:41:41 +0100873 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100874 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200875 # TODO:
876 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100877 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100878 my @dirs;
879 my $dir;
880
Akron7d4cdd82016-08-17 21:39:45 +0200881 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100882 while (1) {
883 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200884 push @dirs, $dir;
885 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100886 };
887 last unless $it->next;
888 };
889
Akrona3518372024-01-22 23:29:00 +0100890 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100891 $t = Benchmark->new;
892 $count = scalar @dirs;
893
894 DIRECTORY_LOOP:
895 for (my $i = 0; $i < $count; $i++) {
896
Akrone1dbc382016-07-08 22:24:52 +0200897 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200898 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200899 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200900 );
Akron941c1a62016-02-23 17:41:41 +0100901
902 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200903 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200904
Akron13d56622016-10-31 14:54:49 +0100905 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200906 $pool->finish(
907 0,
Akronda3097e2017-04-23 19:53:57 +0200908 [
909 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
910 undef,
911 $filename
912 ]
Akron486f9ab2017-04-22 23:25:19 +0200913 );
Akron3ec48972016-08-17 23:24:52 +0200914 }
915 else {
Akron4c0cf312016-10-15 16:42:09 +0200916 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200917 };
Akron941c1a62016-02-23 17:41:41 +0100918 };
919 }
920
921 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200922 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200923
Akron941c1a62016-02-23 17:41:41 +0100924 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200925 $log->error("Unzip is not installed or incompatible.");
926 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100927 };
928
Akron08385f62016-03-22 20:37:04 +0100929 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200930 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100931
Akron31a08cb2019-02-20 20:43:26 +0100932 # Get sigles to extract
933 my $prefix = set_sigle($archive);
934
Akrona3518372024-01-22 23:29:00 +0100935 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100936 $t = Benchmark->new;
937 my @dirs = $archive->list_texts;
938 $count = scalar @dirs;
939
940 ARCHIVE_LOOP:
941 for (my $i = 0; $i < $count; $i++) {
942
943 # Split path information
944 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
945
Akrone1dbc382016-07-08 22:24:52 +0200946 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200947 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200948 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200949 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200950 catfile($corpus, $doc, $text)
951 . '.json' . ($gzip ? '.gz' : '')
952 )
Akrone1dbc382016-07-08 22:24:52 +0200953 );
Akron941c1a62016-02-23 17:41:41 +0100954
955 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200956 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100957
Akron4c0cf312016-10-15 16:42:09 +0200958 # Create temporary file
959 $temp = File::Temp->newdir;
960
Akronbdf434a2016-10-24 17:42:07 +0200961 # TODO: Check if $filename exist at the beginning,
962 # because extraction can be horrible slow!
963
Akron941c1a62016-02-23 17:41:41 +0100964 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100965 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100966
Akron7d4cdd82016-08-17 21:39:45 +0200967 # Create corpus directory
968 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100969
Akron7d4cdd82016-08-17 21:39:45 +0200970 # Temporary directory
971 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100972
Akron7d4cdd82016-08-17 21:39:45 +0200973 # Write file
Akron13d56622016-10-31 14:54:49 +0100974 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200975
Akron4c0cf312016-10-15 16:42:09 +0200976 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100977 $pool->finish(
978 0,
Akronda3097e2017-04-23 19:53:57 +0200979 [
980 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
981 $temp,
982 $filename
983 ]
Akron13d56622016-10-31 14:54:49 +0100984 );
Akron7d4cdd82016-08-17 21:39:45 +0200985 }
986 else {
Akron4c0cf312016-10-15 16:42:09 +0200987 # Delete temporary file
988 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200989 };
Akron941c1a62016-02-23 17:41:41 +0100990 }
Akron7d4cdd82016-08-17 21:39:45 +0200991
992 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100993 else {
Akron4c0cf312016-10-15 16:42:09 +0200994 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100995 };
996 };
997 }
998
999 else {
Akrona3518372024-01-22 23:29:00 +01001000 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +01001001 };
1002
1003 $pool->wait_all_children;
1004
Akronda3097e2017-04-23 19:53:57 +02001005 # Close tar filehandle
1006 if ($to_tar && $tar_fh) {
1007 $tar_archive->finish;
1008 $tar_fh->close;
Akrona3518372024-01-22 23:29:00 +01001009 print "Wrote to tar archive.\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +02001010 };
Akrona3518372024-01-22 23:29:00 +01001011 unless ($q) {
1012 print timestr(timediff(Benchmark->new, $t))."\n";
1013 print "Done.\n";
1014 };
Akron81500102017-04-07 20:45:44 +02001015};
Akron941c1a62016-02-23 17:41:41 +01001016
Nils Diewald2db9ad02013-10-29 19:26:43 +00001017
# For an archive, this will create the list of all sigles to process.
#
# The list is kept in the file-level @sigle array:
# - If @sigle is empty, it is filled with the sigle (corpus/doc/text)
#   of every text listed by the archive.
# - Otherwise every document sigle (corpus/doc) in @sigle is extracted
#   to $output right away and only the remaining sigles are kept.
#
# Expects the archive object to list and extract from.
# Returns the prefix information of the archive, i.e. the first value
# of split_path() for the last listed text or the result of
# check_prefix(). Defaults to 1.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - take all texts of the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  my @text_sigle;
  my $prefix_checked = 0;

  # Check (only once) if a prefix is needed.
  # The assignment is done separately from the quiet check, as
  # '$prefix = $archive->check_prefix && !$q' would bind '&&' tighter
  # than '=' and therefore always reset the prefix in quiet mode.
  my $check_prefix = sub {
    return if $prefix_checked;
    $prefix_checked = 1;

    $prefix = $archive->check_prefix;
    print " with prefix ..." if $prefix && !$q;
    return;
  };

  # Iterate over all sigle
  foreach my $item (@sigle) {

    # Sigle is a doc sigle - extract immediately
    if ($item =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$item ..." unless $q;
      $check_prefix->();
      print "\n" unless $q;

      my $extracted = $archive->extract_sigle(
        $q, [$item], $output, $sequential_extraction ? 1 : $jobs
      );

      print '... ' . ($extracted ? '' : 'not ') . "extracted.\n" unless $q;
    }

    # Sigle is a text sigle - keep for later processing
    else {
      push @text_sigle, $item;
      $check_prefix->();
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
1096
1097
# Remove the temporary extraction directory (if one was used)
# and log how many objects remove_tree() reports as deleted
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};

# Finish the output with a newline
print "\n";
1106
Nils Diewald2db9ad02013-10-29 19:26:43 +00001107__END__
Akron941c1a62016-02-23 17:41:41 +01001108
1109=pod
1110
1111=encoding utf8
1112
1113=head1 NAME
1114
Akron42f48c12020-02-14 13:08:13 +01001115korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001116
1117
1118=head1 SYNOPSIS
1119
Akron9cb8c982024-03-22 10:46:56 +01001120 $ korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001121
Akron2fd402b2016-10-27 21:26:48 +02001122
Akron941c1a62016-02-23 17:41:41 +01001123=head1 DESCRIPTION
1124
1125L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1126compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001127The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001128
1129
1130=head1 INSTALLATION
1131
1132The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1133
Akron9cb8c982024-03-22 10:46:56 +01001134 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001135
Akronc13a1702016-03-15 19:33:14 +01001136In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001137be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001138Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001139Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1140Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001142
1143=head1 ARGUMENTS
1144
Akron9cb8c982024-03-22 10:46:56 +01001145 $ korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001146
1147Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001148It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001149
Akron941c1a62016-02-23 17:41:41 +01001150=over 2
1151
1152=item B<archive>
1153
Akron9cb8c982024-03-22 10:46:56 +01001154 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001155
Akron2fd402b2016-10-27 21:26:48 +02001156Converts an archive of KorAP-XML documents. It expects a directory
1157(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001158
1159=item B<extract>
1160
Akron9cb8c982024-03-22 10:46:56 +01001161 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001162
1163Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001164
Akron63f20d42017-04-10 23:40:29 +02001165=item B<serial>
1166
Akron9cb8c982024-03-22 10:46:56 +01001167 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001168
1169Convert archives sequentially. The inputs are not merged but treated
1170as they are (so they may be premerged or globs).
1171the C<--out> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001172are created based on the archive name. In case the C<--to-tar> flag is given,
1173the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001174
1175
Akron84b53ad2022-01-14 12:39:15 +01001176=item B<slimlog>
1177
Akron9cb8c982024-03-22 10:46:56 +01001178 $ korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001179
1180Filters out all useless aka succesfull information from logs, to simplify
1181log checks. Expects no further options.
1182
1183
Akron941c1a62016-02-23 17:41:41 +01001184=back
1185
1186
1187=head1 OPTIONS
1188
1189=over 2
1190
Akrona76d8352016-10-27 16:27:32 +02001191=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001192
Akrona76d8352016-10-27 16:27:32 +02001193Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001194
Akron7606afa2016-10-25 16:23:49 +02001195Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001196document, while C<archive> expects a KorAP-XML corpus folder or a zip
1197file to batch process multiple files.
1198C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001199
Akrona76d8352016-10-27 16:27:32 +02001200C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001201that the first archive listed contains all primary data files
1202and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001203
Akron7606afa2016-10-25 16:23:49 +02001204 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001205
Akron821db3d2017-04-06 21:19:31 +02001206Input may also be defined using BSD glob wildcards.
1207
1208 -i 'file/news*.zip'
1209
1210The extended input array will be sorted in length order, so the shortest
1211path needs to contain all primary data files and all meta data files.
1212
Akron0c3e3752016-06-28 15:55:53 +02001213(The directory structure follows the base directory format,
1214that may include a C<.> root folder.
1215In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001216need to be passed with a hash sign in front of the archive's name.
1217This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001218
Akron7606afa2016-10-25 16:23:49 +02001219To support zip files, a version of C<unzip> needs to be installed that is
1220compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001221
Akron7606afa2016-10-25 16:23:49 +02001222B<The root folder switch using the hash sign is experimental and
1223may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001224
Akronf73ffb62018-06-27 12:13:59 +02001225
Akron63f20d42017-04-10 23:40:29 +02001226=item B<--input-base|-ib> <directory>
1227
1228The base directory for inputs.
1229
1230
Akron941c1a62016-02-23 17:41:41 +01001231=item B<--output|-o> <directory|file>
1232
1233Output folder for archive processing or
1234document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001235writes to C<STDOUT> by default
1236(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001237
1238=item B<--overwrite|-w>
1239
1240Overwrite files that already exist.
1241
Akronf73ffb62018-06-27 12:13:59 +02001242
Akron3741f8b2016-12-21 19:55:21 +01001243=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001244
1245Define the default tokenization by specifying
1246the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001247of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001248This will directly take the file instead of running
1249the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001250
Akron3741f8b2016-12-21 19:55:21 +01001251
1252=item B<--base-sentences|-bs> <foundry>#<layer>
1253
1254Define the layer for base sentences.
1255If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001256Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1257layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001258
1259 Defaults to unset.
1260
1261
1262=item B<--base-paragraphs|-bp> <foundry>#<layer>
1263
1264Define the layer for base paragraphs.
1265If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001266Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1267layer supported.
Akron3741f8b2016-12-21 19:55:21 +01001268
1269 Defaults to unset.
1270
1271
Akron41ac10b2017-02-08 22:47:25 +01001272=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1273
1274Define the layer for base pagebreaks.
1275Currently C<DeReKo#Structure> is the only layer supported.
1276
1277 Defaults to unset.
1278
1279
Akron941c1a62016-02-23 17:41:41 +01001280=item B<--skip|-s> <foundry>[#<layer>]
1281
Akronf7ad89e2016-03-16 18:22:47 +01001282Skip specific annotations by specifying the foundry
1283(and optionally the layer with a C<#>-prefix),
1284e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001285Can be set multiple times.
1286
Akronf73ffb62018-06-27 12:13:59 +02001287
Akronc13a1702016-03-15 19:33:14 +01001288=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001289
Akronf7ad89e2016-03-16 18:22:47 +01001290Convert specific annotations by specifying the foundry
1291(and optionally the layer with a C<#>-prefix),
1292e.g. C<Mate> or C<Mate#Morpho>.
1293Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001294
Akronf73ffb62018-06-27 12:13:59 +02001295
Akroned9baf02019-01-22 17:03:25 +01001296=item B<--non-word-tokens|-nwt>
1297
1298Tokenize non-word tokens like word tokens (defined as matching
1299C</[\d\w]/>). Useful to treat punctuations as tokens.
1300
1301 Defaults to unset.
1302
Akronf1849aa2019-12-16 23:35:33 +01001303
1304=item B<--non-verbal-tokens|-nvt>
1305
1306Tokenize non-verbal tokens marked as in the primary data as
1307the unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1308
1309 Defaults to unset.
1310
1311
Akron941c1a62016-02-23 17:41:41 +01001312=item B<--jobs|-j>
1313
Akron29128262024-04-17 15:50:36 +02001314Define the number of spawned forks for concurrent jobs
1315of archive processing.
Akron11c80302016-03-18 19:44:43 +01001316Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001317
Akrona472a242023-02-13 13:46:30 +01001318If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001319also apply to extraction.
1320
Akronebbac2e2024-03-22 10:31:23 +01001321Pass C<-1>, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001322times the number of available cores, in case L<Sys::Info>
Akronebbac2e2024-03-22 10:31:23 +01001323is available and can read CPU count (see C<--job-count>).
1324Be aware, that the report of available cores
Akron29128262024-04-17 15:50:36 +02001325may not work in certain conditions. Benchmarking the processing
1326speed based on the number of jobs may be valuable.
Akronebbac2e2024-03-22 10:31:23 +01001327
Akronf7ad89e2016-03-16 18:22:47 +01001328This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akronebbac2e2024-03-22 10:31:23 +01001331=item B<--job-count|-jc>
1332
1333Print job and core information that would be used if
1334C<-1> was passed to C<--jobs>.
1335
1336
Akron263274c2019-02-07 09:48:30 +01001337=item B<--koral|-k>
1338
1339Version of the output format. Supported versions are:
1340C<0> for legacy serialization, C<0.03> for serialization
1341with metadata fields as key-values on the root object,
1342C<0.4> for serialization with metadata fields as a list
1343of C<"@type":"koral:field"> objects.
1344
1345Currently defaults to C<0.03>.
1346
1347
Akron9ec88872017-04-12 16:29:06 +02001348=item B<--sequential-extraction|-se>
1349
Flag to indicate that extraction runs sequentially, i.e. that the
C<jobs> value does not apply to extraction.
1351Some systems may have problems with extracting multiple archives
1352to the same folder at the same time.
1353Can be flagged using C<--no-sequential-extraction> as well.
1354Defaults to C<false>.
1355
Akronf73ffb62018-06-27 12:13:59 +02001356
Akron35db6e32016-03-17 22:42:22 +01001357=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001358
Akron35db6e32016-03-17 22:42:22 +01001359Define the metadata parser to use. Defaults to C<I5>.
1360Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1361This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001362
Akronf73ffb62018-06-27 12:13:59 +02001363
Akron941c1a62016-02-23 17:41:41 +01001364=item B<--gzip|-z>
1365
Akronf7ad89e2016-03-16 18:22:47 +01001366Compress the output.
1367Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001368
Akronf73ffb62018-06-27 12:13:59 +02001369
Akron11c80302016-03-18 19:44:43 +01001370=item B<--cache|-c>
1371
1372File to mmap a cache (using L<Cache::FastMmap>).
1373Defaults to C<korapxml2krill.cache> in the calling directory.
1374
Akronf73ffb62018-06-27 12:13:59 +02001375
Akron11c80302016-03-18 19:44:43 +01001376=item B<--cache-size|-cs>
1377
1378Size of the cache. Defaults to C<50m>.
1379
Akronf73ffb62018-06-27 12:13:59 +02001380
Akron11c80302016-03-18 19:44:43 +01001381=item B<--cache-init|-ci>
1382
1383Initialize cache file.
1384Can be flagged using C<--no-cache-init> as well.
1385Defaults to C<true>.
1386
Akronf73ffb62018-06-27 12:13:59 +02001387
Akron11c80302016-03-18 19:44:43 +01001388=item B<--cache-delete|-cd>
1389
1390Delete cache file after processing.
1391Can be flagged using C<--no-cache-delete> as well.
1392Defaults to C<true>.
1393
Akronf73ffb62018-06-27 12:13:59 +02001394
Akron636aa112017-04-07 18:48:56 +02001395=item B<--config|-cfg>
1396
1397Configure the parameters of your call in a file
1398of key-value pairs with whitespace separator
1399
1400 overwrite 1
1401 token DeReKo#Structure
1402 ...
1403
1404Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001405C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akronc0ac4ff2024-04-15 18:03:15 +02001406C<token>, C<log>,
1407C<cache>, C<cache-size>, C<cache-init>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001408C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001409C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001410C<base-sentences>, C<base-paragraphs>,
1411C<base-pagebreaks>,
1412C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001413(semicolon separated), C<anno> (semicolon separated).
1414
Akronf73ffb62018-06-27 12:13:59 +02001415Configuration parameters will always be overwritten by
1416passed parameters.
1417
1418
Akron81500102017-04-07 20:45:44 +02001419=item B<--temporary-extract|-te>
1420
Akrona472a242023-02-13 13:46:30 +01001421Only valid for the C<archive> and C<serial>
1422commands.
Akron81500102017-04-07 20:45:44 +02001423
1424This will first extract all files into a
1425directory and then will archive.
1426If the directory is given as C<:temp:>,
1427a temporary directory is used.
1428This is especially useful to avoid
1429massive unzipping and potential
1430network latency.
Akron636aa112017-04-07 18:48:56 +02001431
Akronf73ffb62018-06-27 12:13:59 +02001432
Akronc93a0802019-07-11 15:48:34 +02001433=item B<--to-tar>
1434
1435Only valid for the C<archive> command.
1436
1437Writes the output into a tar archive.
1438
1439
Akrone10ad322016-02-27 10:54:26 +01001440=item B<--sigle|-sg>
1441
Akron20807582016-10-26 17:11:34 +02001442Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001443Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001444I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001445Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001446In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001447On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001448
Akron64f7fae2022-07-27 12:45:33 +02001449=item B<--lang>
1450
1451Preferred language for metadata fields. In case multiple titles are
1452given (on any level) with different C<xml:lang> attributes,
1453the language given is preferred.
1454Because titles may have different sources and different priorities,
1455non-specific language titles may still be preferred in case the title
1456source has a higher priority.
1457
Akronf73ffb62018-06-27 12:13:59 +02001458
Akron941c1a62016-02-23 17:41:41 +01001459=item B<--log|-l>
1460
Akronb9c33812020-10-21 16:19:35 +02001461The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001462
Akronf73ffb62018-06-27 12:13:59 +02001463
Akrona3518372024-01-22 23:29:00 +01001464=item B<--quiet>
1465
1466Silence all information (non-log) outputs.
1467
1468
Akron941c1a62016-02-23 17:41:41 +01001469=item B<--help|-h>
1470
Akron42f48c12020-02-14 13:08:13 +01001471Print help information.
Akron941c1a62016-02-23 17:41:41 +01001472
Akronf73ffb62018-06-27 12:13:59 +02001473
Akron941c1a62016-02-23 17:41:41 +01001474=item B<--version|-v>
1475
1476Print version information.
1477
1478=back
1479
Akronf73ffb62018-06-27 12:13:59 +02001480
Akronc13a1702016-03-15 19:33:14 +01001481=head1 ANNOTATION SUPPORT
1482
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1487
Akron821db3d2017-04-06 21:19:31 +02001488 Base
1489 #Paragraphs
1490 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001491
Akron821db3d2017-04-06 21:19:31 +02001492 Connexor
1493 #Morpho
1494 #Phrase
1495 #Sentences
1496 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001497
Akron821db3d2017-04-06 21:19:31 +02001498 CoreNLP
1499 #Constituency
1500 #Morpho
1501 #NamedEntities
1502 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001503
Akronce125b62017-06-19 11:54:36 +02001504 CMC
1505 #Morpho
1506
Akron821db3d2017-04-06 21:19:31 +02001507 DeReKo
1508 #Structure
Akronc13a1702016-03-15 19:33:14 +01001509
Akron57510c12019-01-04 14:58:53 +01001510 DGD
1511 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001512 #Structure
Akron57510c12019-01-04 14:58:53 +01001513
Akron821db3d2017-04-06 21:19:31 +02001514 DRuKoLa
1515 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001516
Akron821db3d2017-04-06 21:19:31 +02001517 Glemm
1518 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001519
Akronabb36902021-10-11 15:51:06 +02001520 Gingko
1521 #Morpho
1522
Akronea1aed52018-07-19 14:43:34 +02001523 HNC
1524 #Morpho
1525
Akron4c679192018-01-16 17:41:49 +01001526 LWC
1527 #Dependency
1528
Akron821db3d2017-04-06 21:19:31 +02001529 Malt
1530 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001531
Akron821db3d2017-04-06 21:19:31 +02001532 MarMoT
1533 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001534
Akron821db3d2017-04-06 21:19:31 +02001535 Mate
1536 #Dependency
1537 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001538
Akron821db3d2017-04-06 21:19:31 +02001539 MDParser
1540 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001541
Akrone85a7762022-07-22 08:05:03 +02001542 NKJP
1543 #Morpho
1544 #NamedEntities
1545
Akron821db3d2017-04-06 21:19:31 +02001546 OpenNLP
1547 #Morpho
1548 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001549
Akron07e24772020-04-23 14:00:54 +02001550 RWK
1551 #Morpho
1552 #Structure
1553
Akron821db3d2017-04-06 21:19:31 +02001554 Sgbr
1555 #Lemma
1556 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001557
Marc Kupietzb8c53822024-03-16 18:54:08 +01001558 Spacy
1559 #Morpho
1560
Akron7d5e6382019-08-08 16:36:27 +02001561 Talismane
1562 #Dependency
1563 #Morpho
1564
Akron821db3d2017-04-06 21:19:31 +02001565 TreeTagger
1566 #Morpho
1567 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001568
Akron83aedd32023-02-07 10:57:41 +01001569 UDPipe
1570 #Dependency
1571 #Morpho
1572
Akron821db3d2017-04-06 21:19:31 +02001573 XIP
1574 #Constituency
1575 #Morpho
1576 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001577
Akronc13a1702016-03-15 19:33:14 +01001578
1579More importers are in preparation.
1580New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1581See the built-in annotation importers as examples.
1582
Akronf73ffb62018-06-27 12:13:59 +02001583
Akron41e6c8b2021-10-14 20:22:18 +02001584=head1 METADATA SUPPORT
1585
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1587developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1588
1589=over 2
1590
1591=item I5 - Meta data for all I5 files
1592
1593=item Sgbr - Meta data from the Schreibgebrauch project
1594
1595=item Gingko - Meta data from the Gingko project in addition to I5
1596
Akron2532f1b2023-05-15 13:41:24 +02001597=item ICC - Meta data for the ICC in addition to I5
1598
Akron24ad3c02024-06-03 12:38:20 +02001599=item NKJP - Meta data for the NKJP corpora
1600
Akron41e6c8b2021-10-14 20:22:18 +02001601=back
1602
1603More importers are in preparation.
1604New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1605See the built-in meta data importers as examples.
1606
1607
Akron8f69d632020-01-15 16:58:11 +01001608=head1 About KorAP-XML
1609
1610KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1611data model (Bański et al. 2013), where text data are stored physically
1612separated from their interpretations (i.e. annotations).
1613A text document in KorAP-XML therefore consists of several files
1614containing primary data, metadata and annotations.
1615
1616The structure of a single KorAP-XML document can be as follows:
1617
1618 - data.xml
1619 - header.xml
1620 + base
1621 - tokens.xml
1622 - ...
1623 + struct
1624 - structure.xml
1625 - ...
1626 + corenlp
1627 - morpho.xml
1628 - constituency.xml
1629 - ...
1630 + tree_tagger
1631 - morpho.xml
1632 - ...
1633 - ...
1634
1635The C<data.xml> contains the primary data, the C<header.xml> contains
1636the metadata, and the annotation layers are stored in subfolders
1637like C<base>, C<struct> or C<corenlp>
1638(so-called "foundries"; Bański et al. 2013).
1639
1640Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001641(Lüngen and Sperberg-McQueen 2012). See the documentation in
1642L<KorAP::XML::Meta::I5> for translatable fields.
1643
1644Annotations correspond to a variant of the TEI-P5 feature structures
1645(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001646Annotation feature structures refer to character sequences of the primary text
1647inside the C<text> element of the C<data.xml>.
1648A single annotation containing the lemma of a token can have the following structure:
1649
1650 <span from="0" to="3">
1651 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1652 <f name="lex">
1653 <fs>
1654 <f name="lemma">zum</f>
1655 </fs>
1656 </f>
1657 </fs>
1658 </span>
1659
1660The C<from> and C<to> attributes refer to the character span
1661in the primary text.
1662Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1663the structure may vary. See L<KorAP::XML::Annotation::*> for various
1664annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001665
1666Multiple KorAP-XML documents are organized on three levels following
1667the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1668corpus E<gt> document E<gt> text. On each level metadata information
1669can be stored, which C<korapxml2krill> will merge into a single metadata
1670object per text. A corpus is therefore structured as follows:
1671
1672 + <corpus>
1673 - header.xml
1674 + <document>
1675 - header.xml
1676 + <text>
1677 - data.xml
1678 - header.xml
1679 - ...
1680 - ...
1681
1682A single text can be identified by the concatenation of
1683the corpus identifier, the document identifier and the text identifier.
1684This identifier is called the text sigle
1685(e.g. a text with the identifier C<18486> in the document C<060> in the
1686corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1687
1688These corpora are often stored in zip files, which C<korapxml2krill>
1689can deal with. Corpora may also be split into multiple zip archives
1690(e.g. one zip file per foundry), which is also supported (see C<--input>).
1691
1692Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
1693in the form of a test suite.
1694The resulting JSON format merges all annotation layers
1695based on a single token stream.
1696
1697=head2 References
1698
1699Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1700KorAP data model: first approximation, December.
1701
1702Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1703"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1704Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1705L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1706
1707Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1708"Robust corpus architecture: a new look at virtual collections and data access",
1709Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1710L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1711
1712Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1713Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1714"Towards an international standard on feature structure representation",
1715Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1716pp. 373-376.
1717L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1718
1719Harald Lüngen and C. M. Sperberg-McQueen (2012):
1720"A TEI P5 Document Grammar for the IDS Text Model",
1721Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1722L<PDF|https://journals.openedition.org/jtei/pdf/508>
1723
1724TEI Consortium, eds:
1725"Feature Structures",
1726Guidelines for Electronic Text Encoding and Interchange.
1727L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1728
Akron941c1a62016-02-23 17:41:41 +01001729=head1 AVAILABILITY
1730
1731 https://github.com/KorAP/KorAP-XML-Krill
1732
1733
1734=head1 COPYRIGHT AND LICENSE
1735
Akrona3518372024-01-22 23:29:00 +01001736Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001737
Akron6882d7d2021-02-08 09:43:57 +01001738Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001739
Akron29128262024-04-17 15:50:36 +02001740Contributors: Eliza Margaretha, Marc Kupietz
Akron941c1a62016-02-23 17:41:41 +01001741
Akron6882d7d2021-02-08 09:43:57 +01001742L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001743Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001744L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001745member of the
Akronf1849aa2019-12-16 23:35:33 +01001746L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001747
1748This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001749L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001750
1751=cut