blob: 51352e86a4b353c2c2aa01de23c37edd1dc57ca7 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Akron941c1a62016-02-23 17:41:41 +0100174# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100175
# Script metadata - the version message is shown by --help and --version
our $LAST_CHANGE   = '2023/05/16';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Parse command:
# A leading argument that does not start with a dash
# is treated as the sub command and removed from @ARGV
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Remember the remaining raw arguments before GetOptions consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100190
# Options that may be given multiple times are collected in arrays
my @input;
my @skip;
my @sigle;
my @anno;

# Configuration hash - receives all single-value options
my %cfg = ();

# Path to an optional configuration file
my $cfg_file;

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{temporary_extract},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cfg{cache_file},
  'config|cfg=s'             => \$cfg_file,
  'lang=s'                   => \$cfg{lang},
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Deprecated flag - accepted, but only triggers a warning
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'quiet'                    => \$cfg{quiet},

  # Deprecated flag - accepted, but only triggers a warning
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the version message followed by the usage sections
  'help|h'                   => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v'                => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
248
# Parameters passed to pod2usage() whenever the invocation is invalid:
# the version message and the usage sections are printed and the
# script terminates with exit code 1
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200256
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single-value settings: values given on the command line win,
  # the configuration file only fills the gaps
  my @scalar_keys = qw!
    output cache-size input-base token overwrite
    meta base-sentences base-paragraphs base-pagebreaks
    gzip to-tar log lang cache non-word-tokens
    non-verbal-tokens sequential-extraction
    temporary-extract cache-init
    koral extract-dir jobs quiet
  !;

  for my $key (@scalar_keys) {
    next unless defined $config{$key};

    # Configuration keys use dashes, the hash uses underscores
    (my $slot = $key) =~ tr/-/_/;
    $cfg{$slot} //= $config{$key};
  };

  # Multi-value settings (skip, sigle, anno) are given as
  # semicolon separated lists and are only used, if nothing
  # was passed on the command line
  for my $pair (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $target) = @$pair;
    next if scalar(@$target) || !defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
292
# Init variables and set default values
my $output          = $cfg{output};
my $input_base      = $cfg{input_base};
my $gzip            = $cfg{gzip};
my $to_tar          = $cfg{to_tar};
my $extract_dir     = $cfg{temporary_extract};
my $token_base      = $cfg{token}          // 'OpenNLP#tokens';

# The command line option --cache is stored in 'cache_file', while the
# configuration file key 'cache' is stored in 'cache'. Respect both,
# with the command line value taking precedence.
my $cache_file      = $cfg{cache_file}     // $cfg{cache} // 'korapxml2krill.cache';
my $jobs            = $cfg{jobs}           // 0;
my $cache_delete    = $cfg{cache_delete}   // 1;
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Quiet flag as a boolean (double negation never yields undef)
my $q = !!$cfg{quiet};

# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is unconditional on purpose - a 'my' with a
# statement modifier has undefined behaviour.
my ($token_base_foundry, $token_base_layer) =
  $token_base ? split(/#/, $token_base) : ();

# Remove file extension (the layer is undefined,
# if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of lower-cased foundries and layers to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200320
# Route log messages through the 'Stderr' adapter.
# The log level defaults to ERROR.
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is taken from the remaining arguments
  my $log_file = shift @ARGV;

  # Check for a missing argument first, so the file test
  # is never applied to an undefined value
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  my $slimmer = KorAP::XML::Log::Slim->new($log_file);
  $slimmer->slim_to;

  exit;
};
346
347
# When a command writes to an output that is not a tar file,
# the output has to be an existing directory
if ($cmd && $output && !defined($to_tar)) {
  unless (-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000358
# Start serial processing:
# Every input is converted by a separate 'archive' call of this
# script, each writing to its own output below the given output
if ($cmd && $cmd eq 'serial') {

  # Output is required - otherwise catdir() below would be
  # called with an undefined base directory
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all input and output options (and their values) from the
  # arguments, as these are set individually for each archive call
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 0;

    # Input or output flag - the value follows as the next argument
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
    }

    # Value of the preceding input or output flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Input or output in the form of 'option=value'
    elsif (/^--?(?:input|i|output|o)=/) {
      # Drop
    }

    # Pass parameter
    else {
      $keep = 1;
    };

    $keep;
  } @keep_argv;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    unless ($q) {
      print "Start serial processing of $in to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving (list form, so no shell is involved).
    # Failures are reported, but the remaining inputs are
    # still processed.
    my $status = system @archive_cmd;
    if ($status == -1) {
      $log->error("Unable to start archive command for $in: $!");
    }
    elsif ($status != 0) {
      $log->error("Archive command for $in failed (wait status $status)");
    };
  };

  exit;
};
411
# Define supported (and preinstalled) transformation modules.
# Every entry has the form [Foundry, Layer], optionally followed
# by a third parameter. The order of the entries is significant
# for @layers and kept as is.
my @layers = ();

# Base - only added, if no alternative base is given
push @layers, ['Base', 'Sentences']  if !$base_sentences;
push @layers, ['Base', 'Paragraphs'] if !$base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all bases that rely on the DeReKo structure
my @dereko_attr = map  { $_->[0] }
                  grep { $_->[1] eq 'dereko#structure' } (
                    ['sentences',  $base_sentences],
                    ['paragraphs', $base_paragraphs],
                    ['pagebreaks', $base_pagebreaks]
                  );

push @layers, (
  scalar(@dereko_attr)
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Gingko
push @layers, ['Gingko', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# NKJP
push @layers, map { ['NKJP', $_] } qw/Morpho NamedEntities/;

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# UDPipe
push @layers, map { ['UDPipe', $_] } qw/Morpho Dependency/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
Akrone1dbc382016-07-08 22:24:52 +0200538
Akron4fa37c32017-01-20 14:43:10 +0100539
Akrone1dbc382016-07-08 22:24:52 +0200540# Check filters
541my @filtered_anno;
542if ($skip{'#all'}) {
543 foreach (@anno) {
544 push @filtered_anno, [ split('#', $_) ];
545 };
546}
547
548# Add all annotations that are not skipped
549else {
550 # Add to index file - respect skipping
551 foreach my $info (@layers) {
552 # Skip if Foundry or Foundry#Layer should be skipped
553 unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
554 push @filtered_anno, $info;
555 };
556 };
557};
558
Akrone1dbc382016-07-08 22:24:52 +0200559
# TODO: This should not be initialized for batch
# Shared cache, backed by the cache file.
# The size defaults to '50m' and the file is initialized by default.
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cfg{cache_size} // '50m',
  init_file  => $cfg{cache_init} // 1
);

# Create batch object that is used for all conversions
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => $cfg{koral} // $KORAL_VERSION,
  anno              => \@filtered_anno,
  non_word_tokens   => $cfg{non_word_tokens} // 0,
  non_verbal_tokens => $cfg{non_verbal_tokens} // 0,
  lang              => $cfg{lang},
);
582
# Auto adjust jobs:
# A job count of -1 requests five jobs per CPU core
if ($jobs eq '-1') {
  my $cores;

  # Sys::Info is optional - load it at runtime without string eval
  my $loaded = eval {
    require Sys::Info;
    require Sys::Info::Constants;
    Sys::Info::Constants->import(qw( :device_cpu ));
    1;
  };

  # The lookup itself may fail as well
  if ($loaded) {
    $cores = eval { Sys::Info->new->device('CPU')->count };
  };

  # Fall back to a single core, if the number can't be
  # determined (module missing, lookup failed or no count)
  unless ($cores && $cores > 0) {
    $log->warn("Unable to determine number of cores");
    $cores = 1;
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
596
597
# Glob and prefix files
if (@input > 0) {

  # Prefix every input with the input root (if given)
  # and expand wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  unless ($q) {
    print 'Input is ' . join(', ', @input)."\n";
  };
};
617
618
# Process a single file (no command was given)
unless ($cmd) {

  # Take the start time at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Only the first input is processed -
  # ensure the path has a trailing slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
650
Nils Diewald59094f22014-11-05 18:20:50 +0000651
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the first input
  # is the primary archive and has to be a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    # Extract without any report in quiet mode
    if ($q) {
      $archive->extract_sigle(1, [$sigle], $output, $jobs);
      next;
    };

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle(0, [$sigle], $output, $jobs);

    # Report on success or failure
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
700
Akron81500102017-04-07 20:45:44 +0200701
Akron941c1a62016-02-23 17:41:41 +0100702# Process an archive
703elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000704
Akron81500102017-04-07 20:45:44 +0200705 my $archive_output;
706
707 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100708 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200709
710 # Create new archive object
711 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
712
713 # Check zip capabilities
714 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200715 $log->error("Unzip is not installed or incompatible.");
716 exit 1;
Akron81500102017-04-07 20:45:44 +0200717 };
718
719 # Add further annotation archived
720 $archive->attach($_) foreach @input[1..$#input];
721
722 # Create a temporary directory
723 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200724 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100725 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200726 };
727
Akron63f20d42017-04-10 23:40:29 +0200728 # Add some random extra to avoid clashes with multiple archives
729 $extract_dir = catdir($extract_dir, random_string('cccccc'));
730
Akron31a08cb2019-02-20 20:43:26 +0100731 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100732 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
733 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200734 @input = ($extract_dir);
735 }
736 else {
737 $log->error('Unable to extract from primary archive ' . $input[0] .
738 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200739 exit 1;
Akron81500102017-04-07 20:45:44 +0200740 };
741 }
742
743 # Can't create archive object
744 else {
745 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200746 exit 1;
Akron81500102017-04-07 20:45:44 +0200747 };
748 };
749
Akron7d4cdd82016-08-17 21:39:45 +0200750 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100751 my $pool = Parallel::ForkManager->new($jobs);
752
Akron7d4cdd82016-08-17 21:39:45 +0200753 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100754 my $iter = 1; # Current text in process
755
Akronda3097e2017-04-23 19:53:57 +0200756 my $tar_archive;
757 my $output_dir = $output;
758 my $tar_fh;
759
760 # Initialize tar archive
761 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200762
763 # Set output name
764 my $tar_file = $output;
765 unless ($tar_file =~ /\.tar$/) {
766 $tar_file .= '.tar';
767 };
768
769 # Initiate the tar file
Akrona3518372024-01-22 23:29:00 +0100770 print "Writing to file $tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200771 $tar_fh = IO::File->new($tar_file, 'w');
772 $tar_fh->binmode(1);
773
Akroneb370a02022-02-24 13:33:40 +0100774 # Use tar builder for archiving
775 if (eval("use Archive::Tar::Builder; 1;")) {
776 $tar_archive = Archive::Tar::Builder->new(
777 ignore_errors => 1
778 );
779
780 # Set handle
781 $tar_archive->set_handle($tar_fh);
782 }
783
784 # Fallback solution
785 else {
786 $tar_archive = KorAP::XML::TarBuilder->new(
787 $tar_fh
788 );
789 };
Akronda3097e2017-04-23 19:53:57 +0200790
791 # Output to temporary directory
792 $output_dir = File::Temp->newdir;
793 };
794
Akron941c1a62016-02-23 17:41:41 +0100795 # Report on fork message
796 $pool->run_on_finish (
797 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200798 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100799 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200800
Akrona3518372024-01-22 23:29:00 +0100801 unless ($q) {
802 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
803 $iter . "/$count]" .
804 ($code ? " $code" : '') .
805 ' ' . $data->[0] . "\n";
806 };
807 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200808
809 if (!$code && $to_tar && $data->[2]) {
810 my $filename = $data->[2];
811
812 # Lock filehandle
813 if (flock($tar_fh, LOCK_EX)) {
814
Akron9a062ce2017-07-04 19:12:05 +0200815 my $clean_file = fileparse($filename);
816
Akronda3097e2017-04-23 19:53:57 +0200817 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200818 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200819 unlink $filename;
820
821 # Unlock filehandle
822 flock($tar_fh, LOCK_UN);
823 }
824 else {
825 $log->warn("Unable to add $filename to archive");
826 };
827 };
828
Akron4c0cf312016-10-15 16:42:09 +0200829 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100830 }
831 );
832
833 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200834 my $temp;
Akrona3518372024-01-22 23:29:00 +0100835 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100836
Akron7d4cdd82016-08-17 21:39:45 +0200837 # unless (Cache::FastMmap->new(
838 # share_file => $cache_file,
839 # cache_size => $cache_size,
840 # init_file => $cache_init
841 # )) {
842 # print "Unable to intialize cache '$cache_file'\n\n";
843 # exit(1);
844 # };
Akron11c80302016-03-18 19:44:43 +0100845
Akron486f9ab2017-04-22 23:25:19 +0200846
Akron941c1a62016-02-23 17:41:41 +0100847 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100848 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200849 # TODO:
850 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100851 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100852 my @dirs;
853 my $dir;
854
Akron7d4cdd82016-08-17 21:39:45 +0200855 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100856 while (1) {
857 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200858 push @dirs, $dir;
859 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100860 };
861 last unless $it->next;
862 };
863
Akrona3518372024-01-22 23:29:00 +0100864 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100865 $t = Benchmark->new;
866 $count = scalar @dirs;
867
868 DIRECTORY_LOOP:
869 for (my $i = 0; $i < $count; $i++) {
870
Akrone1dbc382016-07-08 22:24:52 +0200871 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200872 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200873 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200874 );
Akron941c1a62016-02-23 17:41:41 +0100875
876 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200877 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200878
Akron13d56622016-10-31 14:54:49 +0100879 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200880 $pool->finish(
881 0,
Akronda3097e2017-04-23 19:53:57 +0200882 [
883 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
884 undef,
885 $filename
886 ]
Akron486f9ab2017-04-22 23:25:19 +0200887 );
Akron3ec48972016-08-17 23:24:52 +0200888 }
889 else {
Akron4c0cf312016-10-15 16:42:09 +0200890 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200891 };
Akron941c1a62016-02-23 17:41:41 +0100892 };
893 }
894
895 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200896 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200897
Akron941c1a62016-02-23 17:41:41 +0100898 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200899 $log->error("Unzip is not installed or incompatible.");
900 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100901 };
902
Akron08385f62016-03-22 20:37:04 +0100903 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200904 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100905
Akron31a08cb2019-02-20 20:43:26 +0100906 # Get sigles to extract
907 my $prefix = set_sigle($archive);
908
Akrona3518372024-01-22 23:29:00 +0100909 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100910 $t = Benchmark->new;
911 my @dirs = $archive->list_texts;
912 $count = scalar @dirs;
913
914 ARCHIVE_LOOP:
915 for (my $i = 0; $i < $count; $i++) {
916
917 # Split path information
918 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
919
Akrone1dbc382016-07-08 22:24:52 +0200920 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200921 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200922 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200923 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200924 catfile($corpus, $doc, $text)
925 . '.json' . ($gzip ? '.gz' : '')
926 )
Akrone1dbc382016-07-08 22:24:52 +0200927 );
Akron941c1a62016-02-23 17:41:41 +0100928
929 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200930 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100931
Akron4c0cf312016-10-15 16:42:09 +0200932 # Create temporary file
933 $temp = File::Temp->newdir;
934
Akronbdf434a2016-10-24 17:42:07 +0200935 # TODO: Check if $filename exist at the beginning,
936 # because extraction can be horrible slow!
937
Akron941c1a62016-02-23 17:41:41 +0100938 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100939 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100940
Akron7d4cdd82016-08-17 21:39:45 +0200941 # Create corpus directory
942 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100943
Akron7d4cdd82016-08-17 21:39:45 +0200944 # Temporary directory
945 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100946
Akron7d4cdd82016-08-17 21:39:45 +0200947 # Write file
Akron13d56622016-10-31 14:54:49 +0100948 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200949
Akron4c0cf312016-10-15 16:42:09 +0200950 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100951 $pool->finish(
952 0,
Akronda3097e2017-04-23 19:53:57 +0200953 [
954 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
955 $temp,
956 $filename
957 ]
Akron13d56622016-10-31 14:54:49 +0100958 );
Akron7d4cdd82016-08-17 21:39:45 +0200959 }
960 else {
Akron4c0cf312016-10-15 16:42:09 +0200961 # Delete temporary file
962 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200963 };
Akron941c1a62016-02-23 17:41:41 +0100964 }
Akron7d4cdd82016-08-17 21:39:45 +0200965
966 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100967 else {
Akron4c0cf312016-10-15 16:42:09 +0200968 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100969 };
970 };
971 }
972
973 else {
Akrona3518372024-01-22 23:29:00 +0100974 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100975 };
976
977 $pool->wait_all_children;
978
Akron11c80302016-03-18 19:44:43 +0100979 # Delete cache file
980 unlink($cache_file) if $cache_delete;
981
Akronda3097e2017-04-23 19:53:57 +0200982 # Close tar filehandle
983 if ($to_tar && $tar_fh) {
984 $tar_archive->finish;
985 $tar_fh->close;
Akrona3518372024-01-22 23:29:00 +0100986 print "Wrote to tar archive.\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200987 };
Akrona3518372024-01-22 23:29:00 +0100988 unless ($q) {
989 print timestr(timediff(Benchmark->new, $t))."\n";
990 print "Done.\n";
991 };
Akron81500102017-04-07 20:45:44 +0200992};
Akron941c1a62016-02-23 17:41:41 +0100993
Nils Diewald2db9ad02013-10-29 19:26:43 +0000994
# For an archive, this will create the list
# of all sigles to process.
#
# Takes the archive object as its only parameter and works on the
# file-level list @sigle:
#
# - If @sigle is empty, it is filled with the sigles
#   ("corpus/doc/text") of all texts listed by the archive.
# - Otherwise every doc sigle ("corpus/doc") is extracted to $output
#   right away and dropped from @sigle, so that only the text
#   sigles remain in @sigle.
#
# Returns the prefix flag: the first value of split_path() of the last
# listed text (first case), the result of check_prefix()
# (second case), or 1 if neither was ever evaluated.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    foreach my $text_path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($text_path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  my $prefix_check = 0;

  # Iterate over all sigle
  foreach my $current (@sigle) {

    # A doc sigle has exactly two path segments (corpus/doc),
    # everything else is treated as a text sigle
    my $is_doc_sigle = ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!);

    print "$current ..." if $is_doc_sigle && !$q;

    # Check once if a prefix is needed.
    # The assignment is deliberately kept separate from the quiet check:
    # "$prefix = $archive->check_prefix && !$q" would assign the result
    # of the whole && expression, so the prefix was always false
    # in quiet mode.
    unless ($prefix_check) {
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix && !$q;
      $prefix_check = 1;
    };

    # Sigle is a text sigle - keep it for later processing
    unless ($is_doc_sigle) {
      push @text_sigle, $current;
      next;
    };

    # Sigle is a doc sigle - extract the whole document immediately.
    # The newline is printed before extraction starts, so the
    # output order stays the same as before.
    print "\n" unless $q;

    my $extracted = $archive->extract_sigle(
      $q, [$current], $output, $sequential_extraction ? 1 : $jobs
    );

    print '... ' . ($extracted ? '' : 'not ') . "extracted.\n" unless $q;
  };

  @sigle = @text_sigle;

  return $prefix;
};
1073
1074
# Remove the temporary extraction directory, in case one is set,
# and log the number of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};


# Finish the output with a newline
print "\n";
1083
Nils Diewald2db9ad02013-10-29 19:26:43 +00001084__END__
Akron941c1a62016-02-23 17:41:41 +01001085
1086=pod
1087
1088=encoding utf8
1089
1090=head1 NAME
1091
Akron42f48c12020-02-14 13:08:13 +01001092korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001093
1094
1095=head1 SYNOPSIS
1096
Akrona76d8352016-10-27 16:27:32 +02001097 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001098
Akron2fd402b2016-10-27 21:26:48 +02001099
Akron941c1a62016-02-23 17:41:41 +01001100=head1 DESCRIPTION
1101
1102L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1103compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001104The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001105
1106
1107=head1 INSTALLATION
1108
1109The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1110
Akronaf386982016-10-12 00:33:25 +02001111 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001112
Akronc13a1702016-03-15 19:33:14 +01001113In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001114be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001115Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001116Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1117Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001119
1120=head1 ARGUMENTS
1121
Akrona76d8352016-10-27 16:27:32 +02001122 $ korapxml2krill -z --input <directory> --output <filename>
1123
1124Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001125It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001126
Akron941c1a62016-02-23 17:41:41 +01001127=over 2
1128
1129=item B<archive>
1130
Akron081639e2017-04-21 19:01:39 +02001131 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001132
Akron2fd402b2016-10-27 21:26:48 +02001133Converts an archive of KorAP-XML documents. It expects a directory
1134(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001135
1136=item B<extract>
1137
Akrona76d8352016-10-27 16:27:32 +02001138 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1139
1140Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001141
Akron63f20d42017-04-10 23:40:29 +02001142=item B<serial>
1143
1144 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1145
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001151
1152
Akron84b53ad2022-01-14 12:39:15 +01001153=item B<slimlog>
1154
1155 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1156
Filters out all useless aka successful information from logs, to simplify
log checks. Expects no further options.
1159
1160
Akron941c1a62016-02-23 17:41:41 +01001161=back
1162
1163
1164=head1 OPTIONS
1165
1166=over 2
1167
Akrona76d8352016-10-27 16:27:32 +02001168=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001169
Akrona76d8352016-10-27 16:27:32 +02001170Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001171
Akron7606afa2016-10-25 16:23:49 +02001172Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001173document, while C<archive> expects a KorAP-XML corpus folder or a zip
1174file to batch process multiple files.
1175C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001176
Akrona76d8352016-10-27 16:27:32 +02001177C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001178that the first archive listed contains all primary data files
1179and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001180
Akron7606afa2016-10-25 16:23:49 +02001181 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001182
Akron821db3d2017-04-06 21:19:31 +02001183Input may also be defined using BSD glob wildcards.
1184
1185 -i 'file/news*.zip'
1186
1187The extended input array will be sorted in length order, so the shortest
1188path needs to contain all primary data files and all meta data files.
1189
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001195
Akron7606afa2016-10-25 16:23:49 +02001196To support zip files, a version of C<unzip> needs to be installed that is
1197compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001198
Akron7606afa2016-10-25 16:23:49 +02001199B<The root folder switch using the hash sign is experimental and
1200may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001201
Akronf73ffb62018-06-27 12:13:59 +02001202
Akron63f20d42017-04-10 23:40:29 +02001203=item B<--input-base|-ib> <directory>
1204
1205The base directory for inputs.
1206
1207
Akron941c1a62016-02-23 17:41:41 +01001208=item B<--output|-o> <directory|file>
1209
1210Output folder for archive processing or
1211document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001212writes to C<STDOUT> by default
1213(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001214
1215=item B<--overwrite|-w>
1216
1217Overwrite files that already exist.
1218
Akronf73ffb62018-06-27 12:13:59 +02001219
Akron3741f8b2016-12-21 19:55:21 +01001220=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001221
1222Define the default tokenization by specifying
1223the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001224of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001225This will directly take the file instead of running
1226the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001227
Akron3741f8b2016-12-21 19:55:21 +01001228
1229=item B<--base-sentences|-bs> <foundry>#<layer>
1230
1231Define the layer for base sentences.
1232If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001233Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1234layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001235
1236 Defaults to unset.
1237
1238
1239=item B<--base-paragraphs|-bp> <foundry>#<layer>
1240
Define the layer for base paragraphs.
If given, this will be used instead of using C<Base#Paragraphs>.
Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001245
1246 Defaults to unset.
1247
1248
Akron41ac10b2017-02-08 22:47:25 +01001249=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1250
1251Define the layer for base pagebreaks.
1252Currently C<DeReKo#Structure> is the only layer supported.
1253
1254 Defaults to unset.
1255
1256
Akron941c1a62016-02-23 17:41:41 +01001257=item B<--skip|-s> <foundry>[#<layer>]
1258
Akronf7ad89e2016-03-16 18:22:47 +01001259Skip specific annotations by specifying the foundry
1260(and optionally the layer with a C<#>-prefix),
1261e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001262Can be set multiple times.
1263
Akronf73ffb62018-06-27 12:13:59 +02001264
Akronc13a1702016-03-15 19:33:14 +01001265=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001266
Akronf7ad89e2016-03-16 18:22:47 +01001267Convert specific annotations by specifying the foundry
1268(and optionally the layer with a C<#>-prefix),
1269e.g. C<Mate> or C<Mate#Morpho>.
1270Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001271
Akronf73ffb62018-06-27 12:13:59 +02001272
Akroned9baf02019-01-22 17:03:25 +01001273=item B<--non-word-tokens|-nwt>
1274
1275Tokenize non-word tokens like word tokens (defined as matching
1276C</[\d\w]/>). Useful to treat punctuations as tokens.
1277
1278 Defaults to unset.
1279
Akronf1849aa2019-12-16 23:35:33 +01001280
1281=item B<--non-verbal-tokens|-nvt>
1282
Tokenize non-verbal tokens that are marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1285
1286 Defaults to unset.
1287
1288
Akron941c1a62016-02-23 17:41:41 +01001289=item B<--jobs|-j>
1290
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001294
Akrona472a242023-02-13 13:46:30 +01001295If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001296also apply to extraction.
1297
Akronc11f7982017-02-21 21:20:14 +01001298Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001299times the number of available cores, in case L<Sys::Info>
1300is available.
Akronf7ad89e2016-03-16 18:22:47 +01001301This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron263274c2019-02-07 09:48:30 +01001304=item B<--koral|-k>
1305
1306Version of the output format. Supported versions are:
1307C<0> for legacy serialization, C<0.03> for serialization
1308with metadata fields as key-values on the root object,
1309C<0.4> for serialization with metadata fields as a list
1310of C<"@type":"koral:field"> objects.
1311
1312Currently defaults to C<0.03>.
1313
1314
Akron9ec88872017-04-12 16:29:06 +02001315=item B<--sequential-extraction|-se>
1316
Flag to indicate that archives are extracted sequentially,
i.e. that the C<jobs> value does not apply to extraction.
Some systems may have problems with extracting multiple archives
to the same folder at the same time.
Can be flagged using C<--no-sequential-extraction> as well.
Defaults to C<false>.
1322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akron35db6e32016-03-17 22:42:22 +01001324=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001325
Akron35db6e32016-03-17 22:42:22 +01001326Define the metadata parser to use. Defaults to C<I5>.
1327Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1328This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron941c1a62016-02-23 17:41:41 +01001331=item B<--gzip|-z>
1332
Akronf7ad89e2016-03-16 18:22:47 +01001333Compress the output.
1334Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001335
Akronf73ffb62018-06-27 12:13:59 +02001336
Akron11c80302016-03-18 19:44:43 +01001337=item B<--cache|-c>
1338
1339File to mmap a cache (using L<Cache::FastMmap>).
1340Defaults to C<korapxml2krill.cache> in the calling directory.
1341
Akronf73ffb62018-06-27 12:13:59 +02001342
Akron11c80302016-03-18 19:44:43 +01001343=item B<--cache-size|-cs>
1344
1345Size of the cache. Defaults to C<50m>.
1346
Akronf73ffb62018-06-27 12:13:59 +02001347
Akron11c80302016-03-18 19:44:43 +01001348=item B<--cache-init|-ci>
1349
1350Initialize cache file.
1351Can be flagged using C<--no-cache-init> as well.
1352Defaults to C<true>.
1353
Akronf73ffb62018-06-27 12:13:59 +02001354
Akron11c80302016-03-18 19:44:43 +01001355=item B<--cache-delete|-cd>
1356
1357Delete cache file after processing.
1358Can be flagged using C<--no-cache-delete> as well.
1359Defaults to C<true>.
1360
Akronf73ffb62018-06-27 12:13:59 +02001361
Akron636aa112017-04-07 18:48:56 +02001362=item B<--config|-cfg>
1363
1364Configure the parameters of your call in a file
1365of key-value pairs with whitespace separator
1366
1367 overwrite 1
1368 token DeReKo#Structure
1369 ...
1370
1371Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001372C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001373C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001374C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001375C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001376C<base-sentences>, C<base-paragraphs>,
1377C<base-pagebreaks>,
1378C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001379(semicolon separated), C<anno> (semicolon separated).
1380
Akronf73ffb62018-06-27 12:13:59 +02001381Configuration parameters will always be overwritten by
1382passed parameters.
1383
1384
Akron81500102017-04-07 20:45:44 +02001385=item B<--temporary-extract|-te>
1386
Akrona472a242023-02-13 13:46:30 +01001387Only valid for the C<archive> and C<serial>
1388commands.
Akron81500102017-04-07 20:45:44 +02001389
1390This will first extract all files into a
1391directory and then will archive.
1392If the directory is given as C<:temp:>,
1393a temporary directory is used.
1394This is especially useful to avoid
1395massive unzipping and potential
1396network latency.
Akron636aa112017-04-07 18:48:56 +02001397
Akronf73ffb62018-06-27 12:13:59 +02001398
Akronc93a0802019-07-11 15:48:34 +02001399=item B<--to-tar>
1400
1401Only valid for the C<archive> command.
1402
1403Writes the output into a tar archive.
1404
1405
Akrone10ad322016-02-27 10:54:26 +01001406=item B<--sigle|-sg>
1407
Akron20807582016-10-26 17:11:34 +02001408Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001409Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001410I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001411Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001412In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001413On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001414
Akron64f7fae2022-07-27 12:45:33 +02001415=item B<--lang>
1416
1417Preferred language for metadata fields. In case multiple titles are
1418given (on any level) with different C<xml:lang> attributes,
1419the language given is preferred.
1420Because titles may have different sources and different priorities,
1421non-specific language titles may still be preferred in case the title
1422source has a higher priority.
1423
Akronf73ffb62018-06-27 12:13:59 +02001424
Akron941c1a62016-02-23 17:41:41 +01001425=item B<--log|-l>
1426
Akronb9c33812020-10-21 16:19:35 +02001427The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001428
Akronf73ffb62018-06-27 12:13:59 +02001429
Akrona3518372024-01-22 23:29:00 +01001430=item B<--quiet>
1431
1432Silence all information (non-log) outputs.
1433
1434
Akron941c1a62016-02-23 17:41:41 +01001435=item B<--help|-h>
1436
Akron42f48c12020-02-14 13:08:13 +01001437Print help information.
Akron941c1a62016-02-23 17:41:41 +01001438
Akronf73ffb62018-06-27 12:13:59 +02001439
Akron941c1a62016-02-23 17:41:41 +01001440=item B<--version|-v>
1441
1442Print version information.
1443
1444=back
1445
Akronf73ffb62018-06-27 12:13:59 +02001446
Akronc13a1702016-03-15 19:33:14 +01001447=head1 ANNOTATION SUPPORT
1448
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1453
Akron821db3d2017-04-06 21:19:31 +02001454 Base
1455 #Paragraphs
1456 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001457
Akron821db3d2017-04-06 21:19:31 +02001458 Connexor
1459 #Morpho
1460 #Phrase
1461 #Sentences
1462 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001463
Akron821db3d2017-04-06 21:19:31 +02001464 CoreNLP
1465 #Constituency
1466 #Morpho
1467 #NamedEntities
1468 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001469
Akronce125b62017-06-19 11:54:36 +02001470 CMC
1471 #Morpho
1472
Akron821db3d2017-04-06 21:19:31 +02001473 DeReKo
1474 #Structure
Akronc13a1702016-03-15 19:33:14 +01001475
Akron57510c12019-01-04 14:58:53 +01001476 DGD
1477 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001478 #Structure
Akron57510c12019-01-04 14:58:53 +01001479
Akron821db3d2017-04-06 21:19:31 +02001480 DRuKoLa
1481 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001482
Akron821db3d2017-04-06 21:19:31 +02001483 Glemm
1484 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001485
Akronabb36902021-10-11 15:51:06 +02001486 Gingko
1487 #Morpho
1488
Akronea1aed52018-07-19 14:43:34 +02001489 HNC
1490 #Morpho
1491
Akron4c679192018-01-16 17:41:49 +01001492 LWC
1493 #Dependency
1494
Akron821db3d2017-04-06 21:19:31 +02001495 Malt
1496 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001497
Akron821db3d2017-04-06 21:19:31 +02001498 MarMoT
1499 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001500
Akron821db3d2017-04-06 21:19:31 +02001501 Mate
1502 #Dependency
1503 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001504
Akron821db3d2017-04-06 21:19:31 +02001505 MDParser
1506 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001507
Akrone85a7762022-07-22 08:05:03 +02001508 NKJP
1509 #Morpho
1510 #NamedEntities
1511
Akron821db3d2017-04-06 21:19:31 +02001512 OpenNLP
1513 #Morpho
1514 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001515
Akron07e24772020-04-23 14:00:54 +02001516 RWK
1517 #Morpho
1518 #Structure
1519
Akron821db3d2017-04-06 21:19:31 +02001520 Sgbr
1521 #Lemma
1522 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001523
Akron7d5e6382019-08-08 16:36:27 +02001524 Talismane
1525 #Dependency
1526 #Morpho
1527
Akron821db3d2017-04-06 21:19:31 +02001528 TreeTagger
1529 #Morpho
1530 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001531
Akron83aedd32023-02-07 10:57:41 +01001532 UDPipe
1533 #Dependency
1534 #Morpho
1535
Akron821db3d2017-04-06 21:19:31 +02001536 XIP
1537 #Constituency
1538 #Morpho
1539 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001540
Akronc13a1702016-03-15 19:33:14 +01001541
1542More importers are in preparation.
1543New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1544See the built-in annotation importers as examples.
1545
Akronf73ffb62018-06-27 12:13:59 +02001546
Akron41e6c8b2021-10-14 20:22:18 +02001547=head1 METADATA SUPPORT
1548
L<KorAP::XML::Krill> has built-in importers for some meta data variants
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1551
1552=over 2
1553
1554=item I5 - Meta data for all I5 files
1555
1556=item Sgbr - Meta data from the Schreibgebrauch project
1557
1558=item Gingko - Meta data from the Gingko project in addition to I5
1559
Akron2532f1b2023-05-15 13:41:24 +02001560=item ICC - Meta data for the ICC in addition to I5
1561
Akron41e6c8b2021-10-14 20:22:18 +02001562=back
1563
1564More importers are in preparation.
1565New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1566See the built-in meta data importers as examples.
1567
1568
Akron8f69d632020-01-15 16:58:11 +01001569=head1 About KorAP-XML
1570
1571KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1572data model (Bański et al. 2013), where text data are stored physically
1573separated from their interpretations (i.e. annotations).
1574A text document in KorAP-XML therefore consists of several files
1575containing primary data, metadata and annotations.
1576
1577The structure of a single KorAP-XML document can be as follows:
1578
1579 - data.xml
1580 - header.xml
1581 + base
1582 - tokens.xml
1583 - ...
1584 + struct
1585 - structure.xml
1586 - ...
1587 + corenlp
1588 - morpho.xml
1589 - constituency.xml
1590 - ...
1591 + tree_tagger
1592 - morpho.xml
1593 - ...
1594 - ...
1595
1596The C<data.xml> contains the primary data, the C<header.xml> contains
1597the metadata, and the annotation layers are stored in subfolders
1598like C<base>, C<struct> or C<corenlp>
1599(so-called "foundries"; Bański et al. 2013).
1600
1601Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001602(Lüngen and Sperberg-McQueen 2012). See the documentation in
1603L<KorAP::XML::Meta::I5> for translatable fields.
1604
1605Annotations correspond to a variant of the TEI-P5 feature structures
1606(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001607Annotation feature structures refer to character sequences of the primary text
1608inside the C<text> element of the C<data.xml>.
1609A single annotation containing the lemma of a token can have the following structure:
1610
1611 <span from="0" to="3">
1612 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1613 <f name="lex">
1614 <fs>
1615 <f name="lemma">zum</f>
1616 </fs>
1617 </f>
1618 </fs>
1619 </span>
1620
1621The C<from> and C<to> attributes refer to the character span
1622in the primary text.
1623Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1624the structure may vary. See the C<KorAP::XML::Annotation::*> namespace for various
1625annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001626
1627Multiple KorAP-XML documents are organized on three levels following
1628the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1629corpus E<gt> document E<gt> text. Metadata can be stored on each level,
1630and C<korapxml2krill> merges it into a single metadata
1631object per text. A corpus is therefore structured as follows:
1632
1633 + <corpus>
1634   - header.xml
1635   + <document>
1636     - header.xml
1637     + <text>
1638       - data.xml
1639       - header.xml
1640       - ...
1641     - ...
1642
1643A single text can be identified by the concatenation of
1644the corpus identifier, the document identifier and the text identifier.
1645This identifier is called the text sigle
1646(e.g. a text with the identifier C<18486> in the document C<060> in the
1647corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1648
1649These corpora are often stored in zip files, which C<korapxml2krill>
1650can process directly. Corpora may also be split into multiple zip archives
1651(e.g. one zip file per foundry), which is also supported (see C<--input>).
1652
1653Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
1654in the form of a test suite.
1655The resulting JSON format merges all annotation layers
1656based on a single token stream.
1657
1658=head2 References
1659
1660Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1661KorAP data model: first approximation, December.
1662
1663Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1664"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1665Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1666L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1667
1668Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1669"Robust corpus architecture: a new look at virtual collections and data access",
1670Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1671L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1672
1673Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1674Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1675"Towards an international standard on feature structure representation",
1676Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1677pp. 373-376.
1678L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1679
1680Harald Lüngen and C. M. Sperberg-McQueen (2012):
1681"A TEI P5 Document Grammar for the IDS Text Model",
1682Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1683L<PDF|https://journals.openedition.org/jtei/pdf/508>
1684
1685TEI Consortium, eds:
1686"Feature Structures",
1687Guidelines for Electronic Text Encoding and Interchange.
1688L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1689
Akron941c1a62016-02-23 17:41:41 +01001690=head1 AVAILABILITY
1691
1692 https://github.com/KorAP/KorAP-XML-Krill
1693
1694
1695=head1 COPYRIGHT AND LICENSE
1696
Akrona3518372024-01-22 23:29:00 +01001697Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001698
Akron6882d7d2021-02-08 09:43:57 +01001699Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001700
Akrona76d8352016-10-27 16:27:32 +02001701Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001702
Akron6882d7d2021-02-08 09:43:57 +01001703L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001704Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001705L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001706member of the
Akronf1849aa2019-12-16 23:35:33 +01001707L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001708
1709This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001710L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001711
1712=cut