blob: 081c7cf2cde7872659a3db1ca96c48891425aa28 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Marc Kupietzb8c53822024-03-16 18:54:08 +0100174# 2024/03/20
175# - Added Spacy support.
176#
Akron941c1a62016-02-23 17:41:41 +0100177# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100178
# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2024/03/20';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Koral version used if none is requested by the user
our $KORAL_VERSION = 0.03;

# One-line version banner (terminated by a newline)
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";

# Parse command:
# The first argument is treated as the command,
# unless it is missing or looks like an option.
our @ARGV;
my $cmd = ($ARGV[0] && $ARGV[0] !~ /^-/) ? shift(@ARGV) : undef;

# Remember the remaining arguments before option parsing consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100193
# Collectors for options that may be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Parameters of the full usage screen,
# shared by --help and the error exits
my %usage_screen = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-'
);

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'gzip|z'                   => \($cfg{gzip}),
  'to-tar'                   => \($cfg{to_tar}),
  'temporary-extract|te=s'   => \($cfg{temporary_extract}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),

  # Document and annotation selection
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),
  'lang=s'                   => \($cfg{lang}),
  'koral|k=f'                => \($cfg{koral}),

  # Caching
  'cache|c=s'                => \($cfg{cache_file}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),

  # Runtime behaviour
  'config|cfg=s'             => \(my $cfg_file),
  'jobs|j=i'                 => \($cfg{jobs}),
  'log|l=s'                  => \($cfg{log}),
  'quiet'                    => \($cfg{quiet}),

  # Flags that are still accepted but have no effect anymore
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Print the full usage screen
  'help|h' => sub {
    pod2usage(%usage_screen);
  },

  # Print the version banner only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);

# Usage screen combined with a failing exit status,
# used whenever the invocation is invalid
my %ERROR_HASH = (%usage_screen, -exit => 1);
Akron63f20d42017-04-10 23:40:29 +0200259
# Load from configuration and fill non-given data.
# Values passed on the command line always take precedence,
# the configuration file only fills what is still undefined.
if ($cfg_file && !-e $cfg_file) {

  # Do not silently ignore a configuration file the user asked for
  warn "Configuration file '$cfg_file' does not exist and is ignored\n";
}

elsif ($cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  # import_from() returns a false value if the file can't be parsed.
  # In that case %config stays empty and nothing is taken over below.
  unless (Config::Simple->import_from($cfg_file, \%config)) {
    warn "Unable to read configuration from '$cfg_file': " .
      (Config::Simple->error // 'unknown error') . "\n";
  };

  # Scalar parameters: the configuration key uses dashes,
  # the key in %cfg uses underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log lang cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init
                      koral extract-dir jobs quiet!) {
    my $underlined = $key =~ tr/-/_/r;
    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # List parameters are given as semicolon separated strings

  # Skip
  if (!scalar(@skip) && defined $config{'skip'}) {
    @skip = split /\s*;\s*/, $config{'skip'} ;
  };

  # Sigle
  if (!scalar(@sigle) && defined $config{'sigle'}) {
    @sigle = split /\s*;\s*/, $config{'sigle'} ;
  };

  # Anno
  if (!scalar(@anno) && defined $config{'anno'}) {
    @anno = split /\s*;\s*/, $config{'anno'} ;
  };
};
295
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{temporary_extract};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored in 'cache_file',
# while the configuration file fills 'cache'. Respect both,
# with the command line taking precedence.
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs         = $cfg{jobs}         // 0;
my $cache_delete = $cfg{cache_delete} // 1;

# Base structures are compared case insensitively
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');

my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Quiet flag as a plain boolean (double negation is always defined)
my $q = !!($cfg{quiet});
Akron63f20d42017-04-10 23:40:29 +0200311
# Get tokenization basis in the form of 'Foundry#Layer'.
# The variables are declared unconditionally, as the behaviour of
# 'my ... if ...' is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (the layer is undefined,
# if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct,
# i.e. 'Corpus_Doc.Text' becomes 'Corpus/Doc/Text'
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of everything to skip (case insensitive)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200323
# Log to STDERR, by default only errors are reported
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first argument left after option parsing
  my $log_file = shift @ARGV;

  # A missing argument is treated like a missing file,
  # without testing an undefined value
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  my $slimmer = KorAP::XML::Log::Slim->new($log_file);

  # Run log filter
  $slimmer->slim_to;

  exit;
};
349
350
# When a command writes to a directory (i.e. the output is not
# meant to be a tar file), the output directory has to exist
if ($cmd && $output && !defined($to_tar)) {
  unless (-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};

# Show usage and fail, if
# - no input is defined, or
# - gzip is requested without an output (it has no effect then)
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000361
# Start serial processing:
# Every input is converted by a separate 'archive' run of this
# script, writing to a subdirectory (or tar file) below the output.
if ($cmd && $cmd eq 'serial') {

  # Output is required, as it is the base of all per-input outputs.
  # Without it, the target would be resolved against the root directory.
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all input and output parameters from the original command
  # line, as they are set individually for every run. All other
  # parameters are passed through.
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag followed by a separate value
    if ($arg =~ /^--?(?:input|i|output|o)$/) {
      $remove_next = 1;
      next;
    }

    # Value of the preceding input or output flag
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    }

    # Input or output flag with an attached value ('--input=file')
    elsif ($arg =~ /^--?(?:input|i|output|o)=/) {
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    unless ($q) {
      print "Start serial processing of $serial_input to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving. A failing run is reported,
    # but does not stop the processing of the remaining inputs.
    if (system(@archive_cmd) != 0) {
      my $reason =
          $? == -1  ? "unable to execute: $!"
        : $? & 127  ? 'terminated by signal ' . ($? & 127)
        :             'exit status ' . ($? >> 8);
      $log->error("Serial processing of $serial_input failed ($reason)");
    };
  };

  exit;
};
414
# Define supported (and preinstalled) transformation modules.
# Every entry has the form [Foundry, Layer], optionally followed by
# a third element (like 'base-sentences').
# The order of the entries is relevant and must be kept.

# Collect all base structures that rely on DeReKo#Structure
my @dereko_attr = map {
  $_->[0] eq 'dereko#structure' ? $_->[1] : ()
} (
  [$base_sentences,  'sentences'],
  [$base_paragraphs, 'paragraphs'],
  [$base_pagebreaks, 'pagebreaks']
);

my @layers = (

  # Base structures - only if not taken from another foundry
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - may serve as the basis for several structures at once
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure']),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence']
     : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Spacy
  ['Spacy', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # UDPipe
  ['UDPipe', 'Morpho'],
  ['UDPipe', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akrone1dbc382016-07-08 22:24:52 +0200545
Akron4fa37c32017-01-20 14:43:10 +0100546
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested
# annotations (given as 'Foundry#Layer') are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # A layer is dropped, if either its foundry or the
  # combination 'foundry#layer' is marked to be skipped
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
565
Akrone1dbc382016-07-08 22:24:52 +0200566
# TODO: This should not be initialized for batch
# Shared cache, stored in the cache file
my $cache = do {
  my $size = $cfg{cache_size} // '50m';
  my $init = $cfg{cache_init} // 1;

  Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $size,
    init_file  => $init
  );
};

# Create batch object
my $batch_file = do {

  # Parameters are collected first (keeping their order)
  my @settings = (
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},

    # Tokenization basis
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,

    gzip              => $gzip,
    log               => $log,

    # Fall back to the default Koral version
    koral             => ($cfg{koral} // $KORAL_VERSION),

    # Annotation layers remaining after filtering
    anno              => \@filtered_anno,

    non_word_tokens   => ($cfg{non_word_tokens}   // 0),
    non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
    lang              => $cfg{lang},
  );

  KorAP::XML::Batch::File->new(@settings);
};
589
# Auto adjust jobs:
# A job count of -1 requests five jobs per available core
if ($jobs eq '-1') {

  # Sys::Info is optional - try to load it at runtime
  my $sys_info_loaded = eval {
    require Sys::Info;
    Sys::Info->import;
    require Sys::Info::Constants;
    Sys::Info::Constants->import(qw( :device_cpu ));
    1;
  };

  # Assume a single core, if the number can't be determined
  my $cores = $sys_info_loaded
    ? Sys::Info->new->device('CPU')->count
    : 1;

  $log->warn("Unable to determine number of cores") unless $sys_info_loaded;

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
603
604
# Glob and prefix files
if (@input) {

  # Every input is
  # - prefixed with the input root (if given),
  # - expanded as a wildcard, and
  # - all resulting files are sorted by the length of their names
  @input =
    sort { length($a) <=> length($b) }
    map  { bsd_glob($input_base ? catfile($input_base, $_) : $_) }
    @input;

  unless ($q) {
    print 'Input is ' . join(', ', @input) . "\n";
  };
};
624
625
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # Start the clocks as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last call of stop_time()
  # and the overall time since the script was compiled
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # The input list may be empty after wildcard expansion -
  # fail explicitly instead of processing an undefined path
  unless (defined $input) {
    $log->error('No input found to process');

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit 1;
  };

  # Create and parse new document -
  # make sure the path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
657
Nils Diewald59094f22014-11-05 18:20:50 +0000658
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the primary archive has to be a file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    # Extract silently
    if ($q) {
      $archive->extract_sigle(1, [$sigle], $output, $jobs);
      next;
    };

    print "$sigle ...\n";

    # TODO:
    # - Make this OS independent
    # - prefix???
    my $negation =
      $archive->extract_sigle(0, [$sigle], $output, $jobs) ? '' : 'not ';

    # Report on the result of the extraction
    print '... ' . $negation . "extracted.\n";
  };
}
707
Akron81500102017-04-07 20:45:44 +0200708
Akron941c1a62016-02-23 17:41:41 +0100709# Process an archive
710elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000711
Akron81500102017-04-07 20:45:44 +0200712 my $archive_output;
713
714 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100715 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200716
717 # Create new archive object
718 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
719
720 # Check zip capabilities
721 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200722 $log->error("Unzip is not installed or incompatible.");
723 exit 1;
Akron81500102017-04-07 20:45:44 +0200724 };
725
726 # Add further annotation archived
727 $archive->attach($_) foreach @input[1..$#input];
728
729 # Create a temporary directory
730 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200731 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100732 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200733 };
734
Akron63f20d42017-04-10 23:40:29 +0200735 # Add some random extra to avoid clashes with multiple archives
736 $extract_dir = catdir($extract_dir, random_string('cccccc'));
737
Akron31a08cb2019-02-20 20:43:26 +0100738 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100739 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
740 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200741 @input = ($extract_dir);
742 }
743 else {
744 $log->error('Unable to extract from primary archive ' . $input[0] .
745 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200746 exit 1;
Akron81500102017-04-07 20:45:44 +0200747 };
748 }
749
750 # Can't create archive object
751 else {
752 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200753 exit 1;
Akron81500102017-04-07 20:45:44 +0200754 };
755 };
756
Akron7d4cdd82016-08-17 21:39:45 +0200757 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100758 my $pool = Parallel::ForkManager->new($jobs);
759
Akron7d4cdd82016-08-17 21:39:45 +0200760 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100761 my $iter = 1; # Current text in process
762
Akronda3097e2017-04-23 19:53:57 +0200763 my $tar_archive;
764 my $output_dir = $output;
765 my $tar_fh;
766
767 # Initialize tar archive
768 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200769
770 # Set output name
771 my $tar_file = $output;
772 unless ($tar_file =~ /\.tar$/) {
773 $tar_file .= '.tar';
774 };
775
776 # Initiate the tar file
Akrona3518372024-01-22 23:29:00 +0100777 print "Writing to file $tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200778 $tar_fh = IO::File->new($tar_file, 'w');
779 $tar_fh->binmode(1);
780
Akroneb370a02022-02-24 13:33:40 +0100781 # Use tar builder for archiving
782 if (eval("use Archive::Tar::Builder; 1;")) {
783 $tar_archive = Archive::Tar::Builder->new(
784 ignore_errors => 1
785 );
786
787 # Set handle
788 $tar_archive->set_handle($tar_fh);
789 }
790
791 # Fallback solution
792 else {
793 $tar_archive = KorAP::XML::TarBuilder->new(
794 $tar_fh
795 );
796 };
Akronda3097e2017-04-23 19:53:57 +0200797
798 # Output to temporary directory
799 $output_dir = File::Temp->newdir;
800 };
801
Akron941c1a62016-02-23 17:41:41 +0100802 # Report on fork message
803 $pool->run_on_finish (
804 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200805 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100806 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200807
Akrona3518372024-01-22 23:29:00 +0100808 unless ($q) {
809 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
810 $iter . "/$count]" .
811 ($code ? " $code" : '') .
812 ' ' . $data->[0] . "\n";
813 };
814 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200815
816 if (!$code && $to_tar && $data->[2]) {
817 my $filename = $data->[2];
818
819 # Lock filehandle
820 if (flock($tar_fh, LOCK_EX)) {
821
Akron9a062ce2017-07-04 19:12:05 +0200822 my $clean_file = fileparse($filename);
823
Akronda3097e2017-04-23 19:53:57 +0200824 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200825 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200826 unlink $filename;
827
828 # Unlock filehandle
829 flock($tar_fh, LOCK_UN);
830 }
831 else {
832 $log->warn("Unable to add $filename to archive");
833 };
834 };
835
Akron4c0cf312016-10-15 16:42:09 +0200836 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100837 }
838 );
839
840 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200841 my $temp;
Akrona3518372024-01-22 23:29:00 +0100842 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100843
Akron7d4cdd82016-08-17 21:39:45 +0200844 # unless (Cache::FastMmap->new(
845 # share_file => $cache_file,
846 # cache_size => $cache_size,
847 # init_file => $cache_init
848 # )) {
849 # print "Unable to intialize cache '$cache_file'\n\n";
850 # exit(1);
851 # };
Akron11c80302016-03-18 19:44:43 +0100852
Akron486f9ab2017-04-22 23:25:19 +0200853
Akron941c1a62016-02-23 17:41:41 +0100854 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100855 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200856 # TODO:
857 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100858 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100859 my @dirs;
860 my $dir;
861
Akron7d4cdd82016-08-17 21:39:45 +0200862 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100863 while (1) {
864 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200865 push @dirs, $dir;
866 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100867 };
868 last unless $it->next;
869 };
870
Akrona3518372024-01-22 23:29:00 +0100871 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100872 $t = Benchmark->new;
873 $count = scalar @dirs;
874
875 DIRECTORY_LOOP:
876 for (my $i = 0; $i < $count; $i++) {
877
Akrone1dbc382016-07-08 22:24:52 +0200878 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200879 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200880 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200881 );
Akron941c1a62016-02-23 17:41:41 +0100882
883 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200884 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200885
Akron13d56622016-10-31 14:54:49 +0100886 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200887 $pool->finish(
888 0,
Akronda3097e2017-04-23 19:53:57 +0200889 [
890 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
891 undef,
892 $filename
893 ]
Akron486f9ab2017-04-22 23:25:19 +0200894 );
Akron3ec48972016-08-17 23:24:52 +0200895 }
896 else {
Akron4c0cf312016-10-15 16:42:09 +0200897 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200898 };
Akron941c1a62016-02-23 17:41:41 +0100899 };
900 }
901
902 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200903 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200904
Akron941c1a62016-02-23 17:41:41 +0100905 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200906 $log->error("Unzip is not installed or incompatible.");
907 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100908 };
909
Akron08385f62016-03-22 20:37:04 +0100910 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200911 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100912
Akron31a08cb2019-02-20 20:43:26 +0100913 # Get sigles to extract
914 my $prefix = set_sigle($archive);
915
Akrona3518372024-01-22 23:29:00 +0100916 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100917 $t = Benchmark->new;
918 my @dirs = $archive->list_texts;
919 $count = scalar @dirs;
920
921 ARCHIVE_LOOP:
922 for (my $i = 0; $i < $count; $i++) {
923
924 # Split path information
925 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
926
Akrone1dbc382016-07-08 22:24:52 +0200927 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200928 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200929 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200930 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200931 catfile($corpus, $doc, $text)
932 . '.json' . ($gzip ? '.gz' : '')
933 )
Akrone1dbc382016-07-08 22:24:52 +0200934 );
Akron941c1a62016-02-23 17:41:41 +0100935
936 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200937 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100938
Akron4c0cf312016-10-15 16:42:09 +0200939 # Create temporary file
940 $temp = File::Temp->newdir;
941
Akronbdf434a2016-10-24 17:42:07 +0200942 # TODO: Check if $filename exist at the beginning,
943 # because extraction can be horrible slow!
944
Akron941c1a62016-02-23 17:41:41 +0100945 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100946 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100947
Akron7d4cdd82016-08-17 21:39:45 +0200948 # Create corpus directory
949 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100950
Akron7d4cdd82016-08-17 21:39:45 +0200951 # Temporary directory
952 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100953
Akron7d4cdd82016-08-17 21:39:45 +0200954 # Write file
Akron13d56622016-10-31 14:54:49 +0100955 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200956
Akron4c0cf312016-10-15 16:42:09 +0200957 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100958 $pool->finish(
959 0,
Akronda3097e2017-04-23 19:53:57 +0200960 [
961 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
962 $temp,
963 $filename
964 ]
Akron13d56622016-10-31 14:54:49 +0100965 );
Akron7d4cdd82016-08-17 21:39:45 +0200966 }
967 else {
Akron4c0cf312016-10-15 16:42:09 +0200968 # Delete temporary file
969 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200970 };
Akron941c1a62016-02-23 17:41:41 +0100971 }
Akron7d4cdd82016-08-17 21:39:45 +0200972
973 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100974 else {
Akron4c0cf312016-10-15 16:42:09 +0200975 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100976 };
977 };
978 }
979
980 else {
Akrona3518372024-01-22 23:29:00 +0100981 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100982 };
983
984 $pool->wait_all_children;
985
Akron11c80302016-03-18 19:44:43 +0100986 # Delete cache file
987 unlink($cache_file) if $cache_delete;
988
Akronda3097e2017-04-23 19:53:57 +0200989 # Close tar filehandle
990 if ($to_tar && $tar_fh) {
991 $tar_archive->finish;
992 $tar_fh->close;
Akrona3518372024-01-22 23:29:00 +0100993 print "Wrote to tar archive.\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200994 };
Akrona3518372024-01-22 23:29:00 +0100995 unless ($q) {
996 print timestr(timediff(Benchmark->new, $t))."\n";
997 print "Done.\n";
998 };
Akron81500102017-04-07 20:45:44 +0200999};
Akron941c1a62016-02-23 17:41:41 +01001000
Nils Diewald2db9ad02013-10-29 19:26:43 +00001001
# For an archive, this will create the list
# of all sigles to process.
#
# Expects the archive object as the only argument.
#
# Works on the file-level list @sigle:
# - If @sigle is empty, it is filled with one "corpus/doc/text"
#   sigle for each text listed in the archive.
# - Otherwise all doc sigles ("corpus/doc") are extracted to $output
#   right away and removed from @sigle, so only the remaining
#   (text) sigles are kept.
#
# Progress messages are printed unless $q is set.
#
# Returns the prefix information: the first value returned by
# split_path() for the last listed text (no sigles given), the
# result of check_prefix() (sigles given), or 1 by default.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # The jobs value only applies to extraction
  # if no sequential extraction is requested
  my $extract_jobs = $sequential_extraction ? 1 : $jobs;

  # Iterate over all sigle
  foreach my $item (@sigle) {

    # A doc sigle consists of exactly two path segments
    # (optionally preceded by a "." root folder)
    my $is_doc = $item =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$item ..." if $is_doc && !$q;

    # Check once if a prefix is needed
    unless ($prefix_checked) {

      # Assign first and decide on the message afterwards:
      # "$prefix = $archive->check_prefix && !$q" would store the
      # result of the whole "&&" expression, so the returned prefix
      # would always be false in quiet mode.
      $prefix = $archive->check_prefix;
      print " with prefix ..." if $prefix && !$q;
      $prefix_checked = 1;
    };

    # Sigle is a text sigle - keep it for later processing
    unless ($is_doc) {
      push @text_sigle, $item;
      next;
    };

    # Sigle is a doc sigle - extract the whole document immediately
    print "\n" unless $q;
    my $extracted = $archive->extract_sigle(
      $q, [$item], $output, $extract_jobs
    );

    unless ($q) {
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
1080
1081
# Cleanup temporary extraction directory, if one was used
if ($extract_dir) {

  # With "safe" set, remove_tree makes no attempt to alter
  # file permissions in order to delete files
  my $removed = remove_tree($extract_dir, { safe => 1 });

  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};


# Terminate the output with a final newline
print "\n";
1090
Nils Diewald2db9ad02013-10-29 19:26:43 +00001091__END__
Akron941c1a62016-02-23 17:41:41 +01001092
1093=pod
1094
1095=encoding utf8
1096
1097=head1 NAME
1098
Akron42f48c12020-02-14 13:08:13 +01001099korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001100
1101
1102=head1 SYNOPSIS
1103
Akrona76d8352016-10-27 16:27:32 +02001104 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001105
Akron2fd402b2016-10-27 21:26:48 +02001106
Akron941c1a62016-02-23 17:41:41 +01001107=head1 DESCRIPTION
1108
1109L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1110compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001111The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001112
1113
1114=head1 INSTALLATION
1115
1116The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1117
Akron58b4c8d2024-03-20 14:16:18 +01001118 cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001119
Akronc13a1702016-03-15 19:33:14 +01001120In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001121be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001122Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001123Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1124Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001126
1127=head1 ARGUMENTS
1128
Akron58b4c8d2024-03-20 14:16:18 +01001129 korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001130
1131Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001132It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001133
Akron941c1a62016-02-23 17:41:41 +01001134=over 2
1135
1136=item B<archive>
1137
Akron58b4c8d2024-03-20 14:16:18 +01001138 korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001139
Akron2fd402b2016-10-27 21:26:48 +02001140Converts an archive of KorAP-XML documents. It expects a directory
1141(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001142
1143=item B<extract>
1144
Akron58b4c8d2024-03-20 14:16:18 +01001145 korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001146
1147Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001148
Akron63f20d42017-04-10 23:40:29 +02001149=item B<serial>
1150
Akron58b4c8d2024-03-20 14:16:18 +01001151 korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001152
1153Convert archives sequentially. The inputs are not merged but treated
1154as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001156are created based on the archive name. In case the C<--to-tar> flag is given,
1157the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001158
1159
Akron84b53ad2022-01-14 12:39:15 +01001160=item B<slimlog>
1161
Akron58b4c8d2024-03-20 14:16:18 +01001162 korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001163
Filters out all useless (i.e. successful) information from logs, to simplify
log checks. Expects no further options.
1166
1167
Akron941c1a62016-02-23 17:41:41 +01001168=back
1169
1170
1171=head1 OPTIONS
1172
1173=over 2
1174
Akrona76d8352016-10-27 16:27:32 +02001175=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001176
Akrona76d8352016-10-27 16:27:32 +02001177Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001178
Akron7606afa2016-10-25 16:23:49 +02001179Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001180document, while C<archive> expects a KorAP-XML corpus folder or a zip
1181file to batch process multiple files.
1182C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001183
Akrona76d8352016-10-27 16:27:32 +02001184C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001185that the first archive listed contains all primary data files
1186and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001187
Akron7606afa2016-10-25 16:23:49 +02001188 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001189
Akron821db3d2017-04-06 21:19:31 +02001190Input may also be defined using BSD glob wildcards.
1191
1192 -i 'file/news*.zip'
1193
1194The extended input array will be sorted in length order, so the shortest
1195path needs to contain all primary data files and all meta data files.
1196
Akron0c3e3752016-06-28 15:55:53 +02001197(The directory structure follows the base directory format,
1198that may include a C<.> root folder.
1199In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001200need to be passed with a hash sign in front of the archive's name.
1201This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001202
Akron7606afa2016-10-25 16:23:49 +02001203To support zip files, a version of C<unzip> needs to be installed that is
1204compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001205
Akron7606afa2016-10-25 16:23:49 +02001206B<The root folder switch using the hash sign is experimental and
1207may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001208
Akronf73ffb62018-06-27 12:13:59 +02001209
Akron63f20d42017-04-10 23:40:29 +02001210=item B<--input-base|-ib> <directory>
1211
1212The base directory for inputs.
1213
1214
Akron941c1a62016-02-23 17:41:41 +01001215=item B<--output|-o> <directory|file>
1216
1217Output folder for archive processing or
1218document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001219writes to C<STDOUT> by default
1220(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001221
1222=item B<--overwrite|-w>
1223
1224Overwrite files that already exist.
1225
Akronf73ffb62018-06-27 12:13:59 +02001226
Akron3741f8b2016-12-21 19:55:21 +01001227=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001228
1229Define the default tokenization by specifying
1230the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001231of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001232This will directly take the file instead of running
1233the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001234
Akron3741f8b2016-12-21 19:55:21 +01001235
1236=item B<--base-sentences|-bs> <foundry>#<layer>
1237
1238Define the layer for base sentences.
1239If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001240Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1241layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001242
1243 Defaults to unset.
1244
1245
1246=item B<--base-paragraphs|-bp> <foundry>#<layer>
1247
1248Define the layer for base paragraphs.
1249If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001250Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001252
1253 Defaults to unset.
1254
1255
Akron41ac10b2017-02-08 22:47:25 +01001256=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1257
1258Define the layer for base pagebreaks.
1259Currently C<DeReKo#Structure> is the only layer supported.
1260
1261 Defaults to unset.
1262
1263
Akron941c1a62016-02-23 17:41:41 +01001264=item B<--skip|-s> <foundry>[#<layer>]
1265
Akronf7ad89e2016-03-16 18:22:47 +01001266Skip specific annotations by specifying the foundry
1267(and optionally the layer with a C<#>-prefix),
1268e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001269Can be set multiple times.
1270
Akronf73ffb62018-06-27 12:13:59 +02001271
Akronc13a1702016-03-15 19:33:14 +01001272=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001273
Akronf7ad89e2016-03-16 18:22:47 +01001274Convert specific annotations by specifying the foundry
1275(and optionally the layer with a C<#>-prefix),
1276e.g. C<Mate> or C<Mate#Morpho>.
1277Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001278
Akronf73ffb62018-06-27 12:13:59 +02001279
Akroned9baf02019-01-22 17:03:25 +01001280=item B<--non-word-tokens|-nwt>
1281
1282Tokenize non-word tokens like word tokens (defined as matching
1283C</[\d\w]/>). Useful to treat punctuations as tokens.
1284
1285 Defaults to unset.
1286
Akronf1849aa2019-12-16 23:35:33 +01001287
1288=item B<--non-verbal-tokens|-nvt>
1289
Tokenize non-verbal tokens marked in the primary data by
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1292
1293 Defaults to unset.
1294
1295
Akron941c1a62016-02-23 17:41:41 +01001296=item B<--jobs|-j>
1297
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001299for archive processing.
Akron11c80302016-03-18 19:44:43 +01001300Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001301
Akrona472a242023-02-13 13:46:30 +01001302If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001303also apply to extraction.
1304
Akronc11f7982017-02-21 21:20:14 +01001305Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001306times the number of available cores, in case L<Sys::Info>
1307is available.
Akronf7ad89e2016-03-16 18:22:47 +01001308This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001309
Akronf73ffb62018-06-27 12:13:59 +02001310
Akron263274c2019-02-07 09:48:30 +01001311=item B<--koral|-k>
1312
1313Version of the output format. Supported versions are:
1314C<0> for legacy serialization, C<0.03> for serialization
1315with metadata fields as key-values on the root object,
1316C<0.4> for serialization with metadata fields as a list
1317of C<"@type":"koral:field"> objects.
1318
1319Currently defaults to C<0.03>.
1320
1321
Akron9ec88872017-04-12 16:29:06 +02001322=item B<--sequential-extraction|-se>
1323
Flag to indicate that extraction runs sequentially, i.e. that the
C<jobs> value does not apply to extraction.
1325Some systems may have problems with extracting multiple archives
1326to the same folder at the same time.
1327Can be flagged using C<--no-sequential-extraction> as well.
1328Defaults to C<false>.
1329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron35db6e32016-03-17 22:42:22 +01001331=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001332
Akron35db6e32016-03-17 22:42:22 +01001333Define the metadata parser to use. Defaults to C<I5>.
1334Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1335This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001336
Akronf73ffb62018-06-27 12:13:59 +02001337
Akron941c1a62016-02-23 17:41:41 +01001338=item B<--gzip|-z>
1339
Akronf7ad89e2016-03-16 18:22:47 +01001340Compress the output.
1341Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001342
Akronf73ffb62018-06-27 12:13:59 +02001343
Akron11c80302016-03-18 19:44:43 +01001344=item B<--cache|-c>
1345
1346File to mmap a cache (using L<Cache::FastMmap>).
1347Defaults to C<korapxml2krill.cache> in the calling directory.
1348
Akronf73ffb62018-06-27 12:13:59 +02001349
Akron11c80302016-03-18 19:44:43 +01001350=item B<--cache-size|-cs>
1351
1352Size of the cache. Defaults to C<50m>.
1353
Akronf73ffb62018-06-27 12:13:59 +02001354
Akron11c80302016-03-18 19:44:43 +01001355=item B<--cache-init|-ci>
1356
1357Initialize cache file.
1358Can be flagged using C<--no-cache-init> as well.
1359Defaults to C<true>.
1360
Akronf73ffb62018-06-27 12:13:59 +02001361
Akron11c80302016-03-18 19:44:43 +01001362=item B<--cache-delete|-cd>
1363
1364Delete cache file after processing.
1365Can be flagged using C<--no-cache-delete> as well.
1366Defaults to C<true>.
1367
Akronf73ffb62018-06-27 12:13:59 +02001368
Akron636aa112017-04-07 18:48:56 +02001369=item B<--config|-cfg>
1370
1371Configure the parameters of your call in a file
1372of key-value pairs with whitespace separator
1373
1374 overwrite 1
1375 token DeReKo#Structure
1376 ...
1377
1378Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001379C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001380C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001381C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001382C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001383C<base-sentences>, C<base-paragraphs>,
1384C<base-pagebreaks>,
1385C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001386(semicolon separated), C<anno> (semicolon separated).
1387
Akronf73ffb62018-06-27 12:13:59 +02001388Configuration parameters will always be overwritten by
1389passed parameters.
1390
1391
Akron81500102017-04-07 20:45:44 +02001392=item B<--temporary-extract|-te>
1393
Akrona472a242023-02-13 13:46:30 +01001394Only valid for the C<archive> and C<serial>
1395commands.
Akron81500102017-04-07 20:45:44 +02001396
1397This will first extract all files into a
1398directory and then will archive.
1399If the directory is given as C<:temp:>,
1400a temporary directory is used.
1401This is especially useful to avoid
1402massive unzipping and potential
1403network latency.
Akron636aa112017-04-07 18:48:56 +02001404
Akronf73ffb62018-06-27 12:13:59 +02001405
Akronc93a0802019-07-11 15:48:34 +02001406=item B<--to-tar>
1407
1408Only valid for the C<archive> command.
1409
1410Writes the output into a tar archive.
1411
1412
Akrone10ad322016-02-27 10:54:26 +01001413=item B<--sigle|-sg>
1414
Akron20807582016-10-26 17:11:34 +02001415Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001416Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001417I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001418Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001419In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001420On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001421
Akron64f7fae2022-07-27 12:45:33 +02001422=item B<--lang>
1423
1424Preferred language for metadata fields. In case multiple titles are
1425given (on any level) with different C<xml:lang> attributes,
1426the language given is preferred.
1427Because titles may have different sources and different priorities,
1428non-specific language titles may still be preferred in case the title
1429source has a higher priority.
1430
Akronf73ffb62018-06-27 12:13:59 +02001431
Akron941c1a62016-02-23 17:41:41 +01001432=item B<--log|-l>
1433
Akronb9c33812020-10-21 16:19:35 +02001434The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001435
Akronf73ffb62018-06-27 12:13:59 +02001436
Akrona3518372024-01-22 23:29:00 +01001437=item B<--quiet>
1438
1439Silence all information (non-log) outputs.
1440
1441
Akron941c1a62016-02-23 17:41:41 +01001442=item B<--help|-h>
1443
Akron42f48c12020-02-14 13:08:13 +01001444Print help information.
Akron941c1a62016-02-23 17:41:41 +01001445
Akronf73ffb62018-06-27 12:13:59 +02001446
Akron941c1a62016-02-23 17:41:41 +01001447=item B<--version|-v>
1448
1449Print version information.
1450
1451=back
1452
Akronf73ffb62018-06-27 12:13:59 +02001453
Akronc13a1702016-03-15 19:33:14 +01001454=head1 ANNOTATION SUPPORT
1455
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1460
Akron821db3d2017-04-06 21:19:31 +02001461 Base
1462 #Paragraphs
1463 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001464
Akron821db3d2017-04-06 21:19:31 +02001465 Connexor
1466 #Morpho
1467 #Phrase
1468 #Sentences
1469 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001470
Akron821db3d2017-04-06 21:19:31 +02001471 CoreNLP
1472 #Constituency
1473 #Morpho
1474 #NamedEntities
1475 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001476
Akronce125b62017-06-19 11:54:36 +02001477 CMC
1478 #Morpho
1479
Akron821db3d2017-04-06 21:19:31 +02001480 DeReKo
1481 #Structure
Akronc13a1702016-03-15 19:33:14 +01001482
Akron57510c12019-01-04 14:58:53 +01001483 DGD
1484 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001485 #Structure
Akron57510c12019-01-04 14:58:53 +01001486
Akron821db3d2017-04-06 21:19:31 +02001487 DRuKoLa
1488 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001489
Akron821db3d2017-04-06 21:19:31 +02001490 Glemm
1491 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001492
Akronabb36902021-10-11 15:51:06 +02001493 Gingko
1494 #Morpho
1495
Akronea1aed52018-07-19 14:43:34 +02001496 HNC
1497 #Morpho
1498
Akron4c679192018-01-16 17:41:49 +01001499 LWC
1500 #Dependency
1501
Akron821db3d2017-04-06 21:19:31 +02001502 Malt
1503 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001504
Akron821db3d2017-04-06 21:19:31 +02001505 MarMoT
1506 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001507
Akron821db3d2017-04-06 21:19:31 +02001508 Mate
1509 #Dependency
1510 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001511
Akron821db3d2017-04-06 21:19:31 +02001512 MDParser
1513 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001514
Akrone85a7762022-07-22 08:05:03 +02001515 NKJP
1516 #Morpho
1517 #NamedEntities
1518
Akron821db3d2017-04-06 21:19:31 +02001519 OpenNLP
1520 #Morpho
1521 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001522
Akron07e24772020-04-23 14:00:54 +02001523 RWK
1524 #Morpho
1525 #Structure
1526
Akron821db3d2017-04-06 21:19:31 +02001527 Sgbr
1528 #Lemma
1529 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001530
Marc Kupietzb8c53822024-03-16 18:54:08 +01001531 Spacy
1532 #Morpho
1533
Akron7d5e6382019-08-08 16:36:27 +02001534 Talismane
1535 #Dependency
1536 #Morpho
1537
Akron821db3d2017-04-06 21:19:31 +02001538 TreeTagger
1539 #Morpho
1540 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001541
Akron83aedd32023-02-07 10:57:41 +01001542 UDPipe
1543 #Dependency
1544 #Morpho
1545
Akron821db3d2017-04-06 21:19:31 +02001546 XIP
1547 #Constituency
1548 #Morpho
1549 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001550
Akronc13a1702016-03-15 19:33:14 +01001551
1552More importers are in preparation.
1553New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1554See the built-in annotation importers as examples.
1555
Akronf73ffb62018-06-27 12:13:59 +02001556
Akron41e6c8b2021-10-14 20:22:18 +02001557=head1 METADATA SUPPORT
1558
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1560developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1561
1562=over 2
1563
1564=item I5 - Meta data for all I5 files
1565
1566=item Sgbr - Meta data from the Schreibgebrauch project
1567
1568=item Gingko - Meta data from the Gingko project in addition to I5
1569
Akron2532f1b2023-05-15 13:41:24 +02001570=item ICC - Meta data for the ICC in addition to I5
1571
Akron41e6c8b2021-10-14 20:22:18 +02001572=back
1573
1574More importers are in preparation.
1575New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1576See the built-in meta data importers as examples.
1577
1578
Akron8f69d632020-01-15 16:58:11 +01001579=head1 About KorAP-XML
1580
1581KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1582data model (Bański et al. 2013), where text data are stored physically
1583separated from their interpretations (i.e. annotations).
1584A text document in KorAP-XML therefore consists of several files
1585containing primary data, metadata and annotations.
1586
1587The structure of a single KorAP-XML document can be as follows:
1588
1589 - data.xml
1590 - header.xml
1591 + base
1592 - tokens.xml
1593 - ...
1594 + struct
1595 - structure.xml
1596 - ...
1597 + corenlp
1598 - morpho.xml
1599 - constituency.xml
1600 - ...
1601 + tree_tagger
1602 - morpho.xml
1603 - ...
1604 - ...
1605
1606The C<data.xml> contains the primary data, the C<header.xml> contains
1607the metadata, and the annotation layers are stored in subfolders
1608like C<base>, C<struct> or C<corenlp>
1609(so-called "foundries"; Bański et al. 2013).
1610
1611Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001612(Lüngen and Sperberg-McQueen 2012). See the documentation in
1613L<KorAP::XML::Meta::I5> for translatable fields.
1614
1615Annotations correspond to a variant of the TEI-P5 feature structures
1616(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001617Annotation feature structures refer to character sequences of the primary text
1618inside the C<text> element of the C<data.xml>.
1619A single annotation containing the lemma of a token can have the following structure:
1620
1621 <span from="0" to="3">
1622 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1623 <f name="lex">
1624 <fs>
1625 <f name="lemma">zum</f>
1626 </fs>
1627 </f>
1628 </fs>
1629 </span>
1630
1631The C<from> and C<to> attributes refer to the character span
1632in the primary text.
1633Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1634the structure may vary. See C<KorAP::XML::Annotation::*> for various
1635annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001636
1637Multiple KorAP-XML documents are organized on three levels following
1638the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1639corpus E<gt> document E<gt> text. On each level, metadata information
1640can be stored, which C<korapxml2krill> will merge into a single metadata
1641object per text. A corpus is therefore structured as follows:
1642
1643 + <corpus>
1644 - header.xml
1645 + <document>
1646 - header.xml
1647 + <text>
1648 - data.xml
1649 - header.xml
1650 - ...
1651 - ...
1652
1653A single text can be identified by the concatenation of
1654the corpus identifier, the document identifier and the text identifier.
1655This identifier is called the text sigle
1656(e.g. a text with the identifier C<18486> in the document C<060> in the
1657corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1658
1659These corpora are often stored in zip files, which C<korapxml2krill>
1660can deal with. Corpora may also be split into multiple zip archives
1661(e.g. one zip file per foundry), which is also supported (see C<--input>).
1662
1663Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
1664in the form of a test suite.
1665The resulting JSON format merges all annotation layers
1666based on a single token stream.
1667
1668=head2 References
1669
1670Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1671KorAP data model: first approximation, December.
1672
1673Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1674"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1675Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1676L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1677
1678Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1679"Robust corpus architecture: a new look at virtual collections and data access",
1680Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1681L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1682
1683Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1684Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1685"Towards an international standard on feature structure representation",
1686Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1687pp. 373-376.
1688L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1689
1690Harald Lüngen and C. M. Sperberg-McQueen (2012):
1691"A TEI P5 Document Grammar for the IDS Text Model",
1692Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1693L<PDF|https://journals.openedition.org/jtei/pdf/508>
1694
1695TEI Consortium, eds:
1696"Feature Structures",
1697Guidelines for Electronic Text Encoding and Interchange.
1698L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1699
Akron941c1a62016-02-23 17:41:41 +01001700=head1 AVAILABILITY
1701
1702 https://github.com/KorAP/KorAP-XML-Krill
1703
1704
1705=head1 COPYRIGHT AND LICENSE
1706
Akrona3518372024-01-22 23:29:00 +01001707Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001708
Akron6882d7d2021-02-08 09:43:57 +01001709Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001710
Akrona76d8352016-10-27 16:27:32 +02001711Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001712
Akron6882d7d2021-02-08 09:43:57 +01001713L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001714Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001715L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001716member of the
Akronf1849aa2019-12-16 23:35:33 +01001717L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001718
1719This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001720L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001721
1722=cut