blob: c468b012f4f3ac0e2794b82516c719dc2117ddbf [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Marc Kupietzb8c53822024-03-16 18:54:08 +0100174# 2024/03/20
175# - Added Spacy support.
176#
Akron941c1a62016-02-23 17:41:41 +0100177# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100178
# Release metadata shown in the help and version output
our $LAST_CHANGE   = '2024/04/15';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Parse command:
# A leading argument that does not start with a dash is
# taken as the command and removed from the argument list
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Copy of the remaining arguments (without the command)
my @keep_argv = @ARGV;

# Values of the list options
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100198
# Parse options from the command line.
# Scalar options are written to %cfg (with dashes in the option
# name replaced by underscores), list options to their own arrays.

# Path of an optional configuration file
my $cfg_file;

# Arguments for the full usage output
my %usage_args = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-'
);

my @option_spec = (

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},

  # Conversion
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{temporary_extract},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cfg{cache},
  'config|cfg=s'             => \$cfg_file,
  'lang=s'                   => \$cfg{lang},
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Deprecated flag - only warns
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'quiet'                    => \$cfg{quiet},

  # Deprecated flag - only warns
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the full usage
  'help|h'                   => sub {
    pod2usage(%usage_args);
  },

  # Print the version message only
  'version|v'                => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);

GetOptions(@option_spec);

# Arguments for the usage output on invalid calls:
# The full usage, but exiting with status 1
my %ERROR_HASH = (%usage_args, -exit => 1);
Akron63f20d42017-04-10 23:40:29 +0200259
# Load from configuration and fill non-given data.
# Values from the command line take precedence: The configuration
# only supplies scalar values that are still undefined and lists
# that are still empty.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log lang cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init cache-delete
                      koral extract-dir jobs quiet!) {

    # Keys are dashed in the configuration and underscored in %cfg.
    # The name is copied and transliterated in place, as the
    # non-destructive tr///r requires Perl 5.14, while this
    # script only requires 5.10.
    (my $underlined = $key) =~ tr/-/_/;

    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # Skip, sigle and anno are lists separated by semicolons
  foreach my $list ([skip => \@skip], [sigle => \@sigle], [anno => \@anno]) {
    my ($key, $values) = @$list;

    # Given on the command line or missing in the configuration
    next if scalar(@$values) || !defined $config{$key};

    @$values = split /\s*;\s*/, $config{$key};
  };
};
295
# Init variables and set default values
my $output                = $cfg{output};
my $input_base            = $cfg{input_base};
my $gzip                  = $cfg{gzip};
my $to_tar                = $cfg{to_tar};
my $extract_dir           = $cfg{temporary_extract};
my $token_base            = $cfg{token} // 'OpenNLP#tokens';
my $cache_file            = $cfg{cache} // 'korapxml2krill.cache';
my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Quiet flag as a boolean - the double negation always yields
# a defined value, so no default is required
my $q = !!$cfg{quiet};

# Get tokenization basis (Foundry#Layer).
# The variables are declared unconditionally, as the behaviour of
# "my" combined with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct (A_B.C becomes A/B/C)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of the lowercased skip values
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200323
# Write log messages to STDERR - the log level defaults to ERROR
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the next argument left on the command line
  my $log_file = shift @ARGV;

  # A missing argument is treated like a missing file,
  # instead of passing an undefined value to the file test
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  KorAP::XML::Log::Slim->new($log_file)->slim_to;

  exit;
};


# For all commands the output has to be an existing directory,
# unless the to-tar option is given (a missing path is no directory
# either, so a single test is sufficient)
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000361
# Start serial processing:
# Every input is converted by a separate run of the 'archive'
# command, writing to an own path below the output.
if ($cmd && $cmd eq 'serial') {

  # Output is required, as all results are written below it -
  # joining an undefined output would yield a path in the root
  # directory
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments to pass on.
  # The flags may be followed by their value as a separate argument
  # ('-i VALUE') or have it attached ('--input=VALUE').
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag
    if (/\A--?(?:input|i|output|o)(?<value>=.*)?\z/s) {

      # Without an attached value, the value is the next argument
      $remove_next = defined $+{value} ? 0 : 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;


  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command, run by the current interpreter
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );

    unless ($q) {
      print "Start serial processing of $serial_input to $new_out\n";
      print 'Command: ', join(' ', @archive_cmd), "\n";
    };

    # Start archiving - failures are reported,
    # but the remaining inputs are still processed
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $serial_input failed" .
          ($? == -1 ? ": $!" : ' with status ' . ($? >> 8))
      );
    };
  };

  exit;
};
414
# Define supported (and preinstalled) transformation modules.
# Every entry is an array of foundry and layer, optionally
# followed by a further parameter.
my @layers = ();

# The base layers are only added,
# if no base-sentences or base-paragraphs are set
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

push(
  @layers,

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo:
# Collect all structures that are based on DeReKo#Structure
my @dereko_attr = ();
foreach my $base (
  [$base_sentences  => 'sentences'],
  [$base_paragraphs => 'paragraphs'],
  [$base_pagebreaks => 'pagebreaks']
) {
  push @dereko_attr, $base->[1] if $base->[0] eq 'dereko#structure';
};

# The collected structures are passed as a further parameter
push(
  @layers,
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push(@layers, ['DGD', 'Morpho']);
push(@layers, ['DGD', 'Structure', 'base-sentence'])
  if $base_sentences eq 'dgd#structure';

push(
  @layers,

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences']
);

# Redewiedergabe
push(@layers, ['RWK', 'Morpho']);
push(@layers, ['RWK', 'Structure'])
  if $base_sentences eq 'rwk#structure';

push(
  @layers,

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Spacy
  ['Spacy', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # UDPipe
  ['UDPipe', 'Morpho'],
  ['UDPipe', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akrone1dbc382016-07-08 22:24:52 +0200545
Akron4fa37c32017-01-20 14:43:10 +0100546
# Check filters
my @filtered_anno;

# Everything is skipped ('#all'):
# Only the explicitly given annotations (Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Add to index file - respect skipping
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
565
Akrone1dbc382016-07-08 22:24:52 +0200566
# TODO: This should not be initialized for batch
# Cache parameters - size and initialization can be configured
my %cache_args = (
  share_file     => $cache_file,
  cache_size     => ($cfg{cache_size} // '50m'),
  init_file      => ($cfg{cache_init} // 1),
  unlink_on_exit => $cache_delete
);
my $cache = Cache::FastMmap->new(%cache_args);

# Create batch object, that receives the cache, the tokenization
# basis, the filtered annotations and the conversion options
my %batch_args = (
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => ($cfg{koral} // $KORAL_VERSION),
  anno              => \@filtered_anno,
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
  lang              => $cfg{lang}
);
my $batch_file = KorAP::XML::Batch::File->new(%batch_args);
590
# Auto adjust jobs:
# A value of -1 is replaced by five jobs per core
if ($jobs eq '-1') {

  # Both modules are loaded at runtime, as they may not be installed
  my $sys_info_loaded = eval("use Sys::Info; 1;")
    && eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  # Fall back to a single core
  my $cores = 1;
  if ($sys_info_loaded) {
    $cores = Sys::Info->new->device('CPU')->count;
  }
  else {
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
604
# Glob and prefix files
if (@input > 0) {

  my @patterns  = ();
  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@patterns, $pattern);
    push (@new_input, bsd_glob($pattern));
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  # A wildcard without any match leaves no input at all,
  # while all commands rely on the first input to exist
  unless (@input) {
    $log->error('No input found for ' . join(', ', @patterns));
    exit 1;
  };

  print 'Input is ' . join(', ', @input)."\n" unless $q;
};
624
# Process a single file (no command was given)
unless ($cmd) {

  # Take the start times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and the overall time,
  # and start the next lap
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Only the first input is processed -
  # append a slash, if the path does not end with one
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/}o;

  # Create and parse new document, and process file
  $batch_file->process($input, $output);

  stop_time();

  exit;
};
654
Nils Diewald59094f22014-11-05 18:20:50 +0000655
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the first input is the primary archive
  # and has to be a plain file
  my $archive = -f($input[0]) && KorAP::XML::Archive->new($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    # Extract without any report
    if ($q) {
      $archive->extract_sigle(1, [$sigle], $output, $jobs);
      next;
    };

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    #   - prefix???
    my $extracted = $archive->extract_sigle(0, [$sigle], $output, $jobs);

    # Report on the result of the extraction
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
704
Akron81500102017-04-07 20:45:44 +0200705
Akron941c1a62016-02-23 17:41:41 +0100706# Process an archive
707elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000708
Akron81500102017-04-07 20:45:44 +0200709 my $archive_output;
710
711 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100712 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200713
714 # Create new archive object
715 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
716
717 # Check zip capabilities
718 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200719 $log->error("Unzip is not installed or incompatible.");
720 exit 1;
Akron81500102017-04-07 20:45:44 +0200721 };
722
723 # Add further annotation archived
724 $archive->attach($_) foreach @input[1..$#input];
725
726 # Create a temporary directory
727 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200728 $extract_dir = tempdir(CLEANUP => 0);
Akrona3518372024-01-22 23:29:00 +0100729 print "Temporarily extract to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200730 };
731
Akron63f20d42017-04-10 23:40:29 +0200732 # Add some random extra to avoid clashes with multiple archives
733 $extract_dir = catdir($extract_dir, random_string('cccccc'));
734
Akron31a08cb2019-02-20 20:43:26 +0100735 # Extract to temporary directory
Akrona3518372024-01-22 23:29:00 +0100736 if ($archive->extract_all($q, $extract_dir, $sequential_extraction ? 1: $jobs)) {
737 print "Extract sequentially to $extract_dir\n" unless $q;
Akron81500102017-04-07 20:45:44 +0200738 @input = ($extract_dir);
739 }
740 else {
741 $log->error('Unable to extract from primary archive ' . $input[0] .
742 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200743 exit 1;
Akron81500102017-04-07 20:45:44 +0200744 };
745 }
746
747 # Can't create archive object
748 else {
749 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200750 exit 1;
Akron81500102017-04-07 20:45:44 +0200751 };
752 };
753
Akron7d4cdd82016-08-17 21:39:45 +0200754 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100755 my $pool = Parallel::ForkManager->new($jobs);
756
Akron7d4cdd82016-08-17 21:39:45 +0200757 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100758 my $iter = 1; # Current text in process
759
Akronda3097e2017-04-23 19:53:57 +0200760 my $tar_archive;
761 my $output_dir = $output;
762 my $tar_fh;
763
764 # Initialize tar archive
765 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200766
767 # Set output name
768 my $tar_file = $output;
769 unless ($tar_file =~ /\.tar$/) {
770 $tar_file .= '.tar';
771 };
772
773 # Initiate the tar file
Akrona3518372024-01-22 23:29:00 +0100774 print "Writing to file $tar_file\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200775 $tar_fh = IO::File->new($tar_file, 'w');
776 $tar_fh->binmode(1);
777
Akroneb370a02022-02-24 13:33:40 +0100778 # Use tar builder for archiving
779 if (eval("use Archive::Tar::Builder; 1;")) {
780 $tar_archive = Archive::Tar::Builder->new(
781 ignore_errors => 1
782 );
783
784 # Set handle
785 $tar_archive->set_handle($tar_fh);
786 }
787
788 # Fallback solution
789 else {
790 $tar_archive = KorAP::XML::TarBuilder->new(
791 $tar_fh
792 );
793 };
Akronda3097e2017-04-23 19:53:57 +0200794
795 # Output to temporary directory
796 $output_dir = File::Temp->newdir;
797 };
798
Akron941c1a62016-02-23 17:41:41 +0100799 # Report on fork message
800 $pool->run_on_finish (
801 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200802 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100803 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200804
Akrona3518372024-01-22 23:29:00 +0100805 unless ($q) {
806 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
807 $iter . "/$count]" .
808 ($code ? " $code" : '') .
809 ' ' . $data->[0] . "\n";
810 };
811 $iter++;
Akronda3097e2017-04-23 19:53:57 +0200812
813 if (!$code && $to_tar && $data->[2]) {
814 my $filename = $data->[2];
815
816 # Lock filehandle
817 if (flock($tar_fh, LOCK_EX)) {
818
Akron9a062ce2017-07-04 19:12:05 +0200819 my $clean_file = fileparse($filename);
820
Akronda3097e2017-04-23 19:53:57 +0200821 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200822 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200823 unlink $filename;
824
825 # Unlock filehandle
826 flock($tar_fh, LOCK_UN);
827 }
828 else {
829 $log->warn("Unable to add $filename to archive");
830 };
831 };
832
Akron4c0cf312016-10-15 16:42:09 +0200833 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100834 }
835 );
836
837 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200838 my $temp;
Akrona3518372024-01-22 23:29:00 +0100839 print "Reading data ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100840
Akron7d4cdd82016-08-17 21:39:45 +0200841 # unless (Cache::FastMmap->new(
842 # share_file => $cache_file,
843 # cache_size => $cache_size,
844 # init_file => $cache_init
845 # )) {
846 # print "Unable to intialize cache '$cache_file'\n\n";
847 # exit(1);
848 # };
Akron11c80302016-03-18 19:44:43 +0100849
Akron486f9ab2017-04-22 23:25:19 +0200850
Akron941c1a62016-02-23 17:41:41 +0100851 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100852 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200853 # TODO:
854 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100855 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100856 my @dirs;
857 my $dir;
858
Akron7d4cdd82016-08-17 21:39:45 +0200859 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100860 while (1) {
861 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200862 push @dirs, $dir;
863 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100864 };
865 last unless $it->next;
866 };
867
Akrona3518372024-01-22 23:29:00 +0100868 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100869 $t = Benchmark->new;
870 $count = scalar @dirs;
871
872 DIRECTORY_LOOP:
873 for (my $i = 0; $i < $count; $i++) {
874
Akrone1dbc382016-07-08 22:24:52 +0200875 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200876 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200877 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200878 );
Akron941c1a62016-02-23 17:41:41 +0100879
880 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200881 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200882
Akron13d56622016-10-31 14:54:49 +0100883 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200884 $pool->finish(
885 0,
Akronda3097e2017-04-23 19:53:57 +0200886 [
887 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
888 undef,
889 $filename
890 ]
Akron486f9ab2017-04-22 23:25:19 +0200891 );
Akron3ec48972016-08-17 23:24:52 +0200892 }
893 else {
Akron4c0cf312016-10-15 16:42:09 +0200894 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200895 };
Akron941c1a62016-02-23 17:41:41 +0100896 };
897 }
898
899 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200900 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200901
Akron941c1a62016-02-23 17:41:41 +0100902 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200903 $log->error("Unzip is not installed or incompatible.");
904 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100905 };
906
Akron08385f62016-03-22 20:37:04 +0100907 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200908 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100909
Akron31a08cb2019-02-20 20:43:26 +0100910 # Get sigles to extract
911 my $prefix = set_sigle($archive);
912
Akrona3518372024-01-22 23:29:00 +0100913 print "Start processing ...\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100914 $t = Benchmark->new;
915 my @dirs = $archive->list_texts;
916 $count = scalar @dirs;
917
918 ARCHIVE_LOOP:
919 for (my $i = 0; $i < $count; $i++) {
920
921 # Split path information
922 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
923
Akrone1dbc382016-07-08 22:24:52 +0200924 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200925 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200926 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200927 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200928 catfile($corpus, $doc, $text)
929 . '.json' . ($gzip ? '.gz' : '')
930 )
Akrone1dbc382016-07-08 22:24:52 +0200931 );
Akron941c1a62016-02-23 17:41:41 +0100932
933 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200934 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100935
Akron4c0cf312016-10-15 16:42:09 +0200936 # Create temporary file
937 $temp = File::Temp->newdir;
938
Akronbdf434a2016-10-24 17:42:07 +0200939 # TODO: Check if $filename exist at the beginning,
940 # because extraction can be horrible slow!
941
Akron941c1a62016-02-23 17:41:41 +0100942 # Extract from archive
Akrona3518372024-01-22 23:29:00 +0100943 if ($archive->extract_sigle($q, [join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100944
Akron7d4cdd82016-08-17 21:39:45 +0200945 # Create corpus directory
946 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100947
Akron7d4cdd82016-08-17 21:39:45 +0200948 # Temporary directory
949 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100950
Akron7d4cdd82016-08-17 21:39:45 +0200951 # Write file
Akron13d56622016-10-31 14:54:49 +0100952 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200953
Akron4c0cf312016-10-15 16:42:09 +0200954 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100955 $pool->finish(
956 0,
Akronda3097e2017-04-23 19:53:57 +0200957 [
958 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
959 $temp,
960 $filename
961 ]
Akron13d56622016-10-31 14:54:49 +0100962 );
Akron7d4cdd82016-08-17 21:39:45 +0200963 }
964 else {
Akron4c0cf312016-10-15 16:42:09 +0200965 # Delete temporary file
966 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200967 };
Akron941c1a62016-02-23 17:41:41 +0100968 }
Akron7d4cdd82016-08-17 21:39:45 +0200969
970 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100971 else {
Akron4c0cf312016-10-15 16:42:09 +0200972 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100973 };
974 };
975 }
976
977 else {
Akrona3518372024-01-22 23:29:00 +0100978 print "Input is neither a directory nor an archive.\n\n" unless $q;
Akron941c1a62016-02-23 17:41:41 +0100979 };
980
981 $pool->wait_all_children;
982
Akronda3097e2017-04-23 19:53:57 +0200983 # Close tar filehandle
984 if ($to_tar && $tar_fh) {
985 $tar_archive->finish;
986 $tar_fh->close;
Akrona3518372024-01-22 23:29:00 +0100987 print "Wrote to tar archive.\n" unless $q;
Akronda3097e2017-04-23 19:53:57 +0200988 };
Akrona3518372024-01-22 23:29:00 +0100989 unless ($q) {
990 print timestr(timediff(Benchmark->new, $t))."\n";
991 print "Done.\n";
992 };
Akron81500102017-04-07 20:45:44 +0200993};
Akron941c1a62016-02-23 17:41:41 +0100994
Nils Diewald2db9ad02013-10-29 19:26:43 +0000995
# For an archive, this will create the list of all sigles to process.
#
# Parameters:
#   $archive - the archive object to read from; list_texts, split_path,
#              check_prefix and extract_sigle are called on it.
#
# Reads the file-level settings $q (quiet), $output, $jobs and
# $sequential_extraction, and reads and rewrites the file-level
# list @sigle:
#
#   - If @sigle is empty, it is filled with one "corpus/doc/text"
#     sigle for every text listed by the archive.
#   - Otherwise every document sigle ("corpus/doc") is extracted
#     to $output immediately and dropped from @sigle, so that only
#     the text sigles remain in the list.
#
# Returns the prefix flag: the first value returned by split_path
# for the last listed text, the result of check_prefix if sigles
# were given, or 1 if neither was ever consulted.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - take every text found in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles were given - extract doc sigles, keep text sigles
  my @new_sigle;
  my $prefix_check = 0;

  # Number of jobs that may be used for extraction
  my $extract_jobs = $sequential_extraction ? 1 : $jobs;

  foreach my $sigle (@sigle) {

    # A doc sigle has exactly two path segments (corpus/doc),
    # optionally preceded by a "./" root
    my $is_doc_sigle = $sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$sigle ..." if $is_doc_sigle && !$q;

    # Check once if a prefix is needed.
    # The quiet flag must only silence the message - it must not
    # influence the value of $prefix. ("&& 1" keeps the value that
    # was returned in non-quiet mode before.)
    unless ($prefix_check) {
      $prefix = $archive->check_prefix && 1;
      print " with prefix ..." if $prefix && !$q;
      $prefix_check = 1;
    };

    # Sigle is a text sigle - keep it for later processing
    unless ($is_doc_sigle) {
      push @new_sigle, $sigle;
      next;
    };

    # Sigle is a doc sigle - extract the whole document right away.
    # The newline is written before the extraction starts, so any
    # output of the extraction itself appears on fresh lines.
    print "\n" unless $q;

    my $extracted = $archive->extract_sigle(
      $q, [$sigle], $output, $extract_jobs
    );

    unless ($q) {
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };
  };

  @sigle = @new_sigle;

  return $prefix;
};
1074
1075
# All commands are done at this point: in case files were
# extracted to a directory beforehand, remove that directory
# again and log the number reported by remove_tree.
if ($extract_dir) {
  my $objects = remove_tree(
    $extract_dir,
    { safe => 1 }
  );

  $log->info("Removed directory $extract_dir with $objects objects");
}


# Always finish the output with a line break
print "\n";
1084
Nils Diewald2db9ad02013-10-29 19:26:43 +00001085__END__
Akron941c1a62016-02-23 17:41:41 +01001086
1087=pod
1088
1089=encoding utf8
1090
1091=head1 NAME
1092
Akron42f48c12020-02-14 13:08:13 +01001093korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001094
1095
1096=head1 SYNOPSIS
1097
Akron9cb8c982024-03-22 10:46:56 +01001098 $ korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001099
Akron2fd402b2016-10-27 21:26:48 +02001100
Akron941c1a62016-02-23 17:41:41 +01001101=head1 DESCRIPTION
1102
1103L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1104compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001105The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001106
1107
1108=head1 INSTALLATION
1109
1110The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1111
Akron9cb8c982024-03-22 10:46:56 +01001112 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001113
Akronc13a1702016-03-15 19:33:14 +01001114In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001115be available on your command line immediately.
Akron8ce23f72023-12-13 15:48:49 +01001116Minimum requirement for L<KorAP::XML::Krill> is Perl 5.32.
Akroneb370a02022-02-24 13:33:40 +01001117Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1118Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001120
1121=head1 ARGUMENTS
1122
Akron9cb8c982024-03-22 10:46:56 +01001123 $ korapxml2krill -z --input <directory> --output <filename>
Akrona76d8352016-10-27 16:27:32 +02001124
1125Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001126It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001127
Akron941c1a62016-02-23 17:41:41 +01001128=over 2
1129
1130=item B<archive>
1131
Akron9cb8c982024-03-22 10:46:56 +01001132 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001133
Akron2fd402b2016-10-27 21:26:48 +02001134Converts an archive of KorAP-XML documents. It expects a directory
1135(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001136
1137=item B<extract>
1138
Akron9cb8c982024-03-22 10:46:56 +01001139 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akrona76d8352016-10-27 16:27:32 +02001140
1141Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001142
Akron63f20d42017-04-10 23:40:29 +02001143=item B<serial>
1144
Akron9cb8c982024-03-22 10:46:56 +01001145 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Akron63f20d42017-04-10 23:40:29 +02001146
1147Convert archives sequentially. The inputs are not merged but treated
1148as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001150are created based on the archive name. In case the C<--to-tar> flag is given,
1151the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001152
1153
Akron84b53ad2022-01-14 12:39:15 +01001154=item B<slimlog>
1155
Akron9cb8c982024-03-22 10:46:56 +01001156 $ korapxml2krill slimlog <logfile> > <logfile-slim>
Akron84b53ad2022-01-14 12:39:15 +01001157
Filters out all useless aka successful information from logs, to simplify
1159log checks. Expects no further options.
1160
1161
Akron941c1a62016-02-23 17:41:41 +01001162=back
1163
1164
1165=head1 OPTIONS
1166
1167=over 2
1168
Akrona76d8352016-10-27 16:27:32 +02001169=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001170
Akrona76d8352016-10-27 16:27:32 +02001171Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001172
Akron7606afa2016-10-25 16:23:49 +02001173Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001174document, while C<archive> expects a KorAP-XML corpus folder or a zip
1175file to batch process multiple files.
1176C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001177
Akrona76d8352016-10-27 16:27:32 +02001178C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001179that the first archive listed contains all primary data files
1180and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001181
Akron7606afa2016-10-25 16:23:49 +02001182 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001183
Akron821db3d2017-04-06 21:19:31 +02001184Input may also be defined using BSD glob wildcards.
1185
1186 -i 'file/news*.zip'
1187
1188The extended input array will be sorted in length order, so the shortest
1189path needs to contain all primary data files and all meta data files.
1190
Akron0c3e3752016-06-28 15:55:53 +02001191(The directory structure follows the base directory format,
1192that may include a C<.> root folder.
1193In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001194need to be passed with a hash sign in front of the archive's name.
1195This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001196
Akron7606afa2016-10-25 16:23:49 +02001197To support zip files, a version of C<unzip> needs to be installed that is
1198compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001199
Akron7606afa2016-10-25 16:23:49 +02001200B<The root folder switch using the hash sign is experimental and
1201may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001202
Akronf73ffb62018-06-27 12:13:59 +02001203
Akron63f20d42017-04-10 23:40:29 +02001204=item B<--input-base|-ib> <directory>
1205
1206The base directory for inputs.
1207
1208
Akron941c1a62016-02-23 17:41:41 +01001209=item B<--output|-o> <directory|file>
1210
1211Output folder for archive processing or
1212document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001213writes to C<STDOUT> by default
1214(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001215
1216=item B<--overwrite|-w>
1217
1218Overwrite files that already exist.
1219
Akronf73ffb62018-06-27 12:13:59 +02001220
Akron3741f8b2016-12-21 19:55:21 +01001221=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001222
1223Define the default tokenization by specifying
1224the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001225of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001226This will directly take the file instead of running
1227the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001228
Akron3741f8b2016-12-21 19:55:21 +01001229
1230=item B<--base-sentences|-bs> <foundry>#<layer>
1231
1232Define the layer for base sentences.
1233If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001234Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1235layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001236
1237 Defaults to unset.
1238
1239
1240=item B<--base-paragraphs|-bp> <foundry>#<layer>
1241
1242Define the layer for base paragraphs.
1243If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001244Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001246
1247 Defaults to unset.
1248
1249
Akron41ac10b2017-02-08 22:47:25 +01001250=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1251
1252Define the layer for base pagebreaks.
1253Currently C<DeReKo#Structure> is the only layer supported.
1254
1255 Defaults to unset.
1256
1257
Akron941c1a62016-02-23 17:41:41 +01001258=item B<--skip|-s> <foundry>[#<layer>]
1259
Akronf7ad89e2016-03-16 18:22:47 +01001260Skip specific annotations by specifying the foundry
1261(and optionally the layer with a C<#>-prefix),
1262e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001263Can be set multiple times.
1264
Akronf73ffb62018-06-27 12:13:59 +02001265
Akronc13a1702016-03-15 19:33:14 +01001266=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001267
Akronf7ad89e2016-03-16 18:22:47 +01001268Convert specific annotations by specifying the foundry
1269(and optionally the layer with a C<#>-prefix),
1270e.g. C<Mate> or C<Mate#Morpho>.
1271Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001272
Akronf73ffb62018-06-27 12:13:59 +02001273
Akroned9baf02019-01-22 17:03:25 +01001274=item B<--non-word-tokens|-nwt>
1275
1276Tokenize non-word tokens like word tokens (defined as matching
1277C</[\d\w]/>). Useful to treat punctuations as tokens.
1278
1279 Defaults to unset.
1280
Akronf1849aa2019-12-16 23:35:33 +01001281
1282=item B<--non-verbal-tokens|-nvt>
1283
Tokenize non-verbal tokens, marked in the primary data by
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1286
1287 Defaults to unset.
1288
1289
Akron941c1a62016-02-23 17:41:41 +01001290=item B<--jobs|-j>
1291
Akron29128262024-04-17 15:50:36 +02001292Define the number of spawned forks for concurrent jobs
1293of archive processing.
Akron11c80302016-03-18 19:44:43 +01001294Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001295
Akrona472a242023-02-13 13:46:30 +01001296If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001297also apply to extraction.
1298
Akronc11f7982017-02-21 21:20:14 +01001299Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001300times the number of available cores, in case L<Sys::Info>
Akron29128262024-04-17 15:50:36 +02001301is available. Be aware, that the report of available cores
1302may not work in certain conditions. Benchmarking the processing
1303speed based on the number of jobs may be valuable.
Akronf7ad89e2016-03-16 18:22:47 +01001304This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001305
Akronf73ffb62018-06-27 12:13:59 +02001306
Akron263274c2019-02-07 09:48:30 +01001307=item B<--koral|-k>
1308
1309Version of the output format. Supported versions are:
1310C<0> for legacy serialization, C<0.03> for serialization
1311with metadata fields as key-values on the root object,
1312C<0.4> for serialization with metadata fields as a list
1313of C<"@type":"koral:field"> objects.
1314
1315Currently defaults to C<0.03>.
1316
1317
Akron9ec88872017-04-12 16:29:06 +02001318=item B<--sequential-extraction|-se>
1319
Flag to indicate that extraction should run sequentially,
so that the C<jobs> value does not apply to extraction.
1321Some systems may have problems with extracting multiple archives
1322to the same folder at the same time.
1323Can be flagged using C<--no-sequential-extraction> as well.
1324Defaults to C<false>.
1325
Akronf73ffb62018-06-27 12:13:59 +02001326
Akron35db6e32016-03-17 22:42:22 +01001327=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001328
Akron35db6e32016-03-17 22:42:22 +01001329Define the metadata parser to use. Defaults to C<I5>.
1330Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1331This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001332
Akronf73ffb62018-06-27 12:13:59 +02001333
Akron941c1a62016-02-23 17:41:41 +01001334=item B<--gzip|-z>
1335
Akronf7ad89e2016-03-16 18:22:47 +01001336Compress the output.
1337Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001338
Akronf73ffb62018-06-27 12:13:59 +02001339
Akron11c80302016-03-18 19:44:43 +01001340=item B<--cache|-c>
1341
1342File to mmap a cache (using L<Cache::FastMmap>).
1343Defaults to C<korapxml2krill.cache> in the calling directory.
1344
Akronf73ffb62018-06-27 12:13:59 +02001345
Akron11c80302016-03-18 19:44:43 +01001346=item B<--cache-size|-cs>
1347
1348Size of the cache. Defaults to C<50m>.
1349
Akronf73ffb62018-06-27 12:13:59 +02001350
Akron11c80302016-03-18 19:44:43 +01001351=item B<--cache-init|-ci>
1352
1353Initialize cache file.
1354Can be flagged using C<--no-cache-init> as well.
1355Defaults to C<true>.
1356
Akronf73ffb62018-06-27 12:13:59 +02001357
Akron11c80302016-03-18 19:44:43 +01001358=item B<--cache-delete|-cd>
1359
1360Delete cache file after processing.
1361Can be flagged using C<--no-cache-delete> as well.
1362Defaults to C<true>.
1363
Akronf73ffb62018-06-27 12:13:59 +02001364
Akron636aa112017-04-07 18:48:56 +02001365=item B<--config|-cfg>
1366
1367Configure the parameters of your call in a file
1368of key-value pairs with whitespace separator
1369
1370 overwrite 1
1371 token DeReKo#Structure
1372 ...
1373
1374Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001375C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akronc0ac4ff2024-04-15 18:03:15 +02001376C<token>, C<log>,
1377C<cache>, C<cache-size>, C<cache-init>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001378C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001379C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001380C<base-sentences>, C<base-paragraphs>,
1381C<base-pagebreaks>,
1382C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001383(semicolon separated), C<anno> (semicolon separated).
1384
Akronf73ffb62018-06-27 12:13:59 +02001385Configuration parameters will always be overwritten by
1386passed parameters.
1387
1388
Akron81500102017-04-07 20:45:44 +02001389=item B<--temporary-extract|-te>
1390
Akrona472a242023-02-13 13:46:30 +01001391Only valid for the C<archive> and C<serial>
1392commands.
Akron81500102017-04-07 20:45:44 +02001393
1394This will first extract all files into a
1395directory and then will archive.
1396If the directory is given as C<:temp:>,
1397a temporary directory is used.
1398This is especially useful to avoid
1399massive unzipping and potential
1400network latency.
Akron636aa112017-04-07 18:48:56 +02001401
Akronf73ffb62018-06-27 12:13:59 +02001402
Akronc93a0802019-07-11 15:48:34 +02001403=item B<--to-tar>
1404
1405Only valid for the C<archive> command.
1406
1407Writes the output into a tar archive.
1408
1409
Akrone10ad322016-02-27 10:54:26 +01001410=item B<--sigle|-sg>
1411
Akron20807582016-10-26 17:11:34 +02001412Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001413Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001414I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001415Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001416In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001417On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001418
Akron64f7fae2022-07-27 12:45:33 +02001419=item B<--lang>
1420
1421Preferred language for metadata fields. In case multiple titles are
1422given (on any level) with different C<xml:lang> attributes,
1423the language given is preferred.
1424Because titles may have different sources and different priorities,
1425non-specific language titles may still be preferred in case the title
1426source has a higher priority.
1427
Akronf73ffb62018-06-27 12:13:59 +02001428
Akron941c1a62016-02-23 17:41:41 +01001429=item B<--log|-l>
1430
Akronb9c33812020-10-21 16:19:35 +02001431The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001432
Akronf73ffb62018-06-27 12:13:59 +02001433
Akrona3518372024-01-22 23:29:00 +01001434=item B<--quiet>
1435
1436Silence all information (non-log) outputs.
1437
1438
Akron941c1a62016-02-23 17:41:41 +01001439=item B<--help|-h>
1440
Akron42f48c12020-02-14 13:08:13 +01001441Print help information.
Akron941c1a62016-02-23 17:41:41 +01001442
Akronf73ffb62018-06-27 12:13:59 +02001443
Akron941c1a62016-02-23 17:41:41 +01001444=item B<--version|-v>
1445
1446Print version information.
1447
1448=back
1449
Akronf73ffb62018-06-27 12:13:59 +02001450
Akronc13a1702016-03-15 19:33:14 +01001451=head1 ANNOTATION SUPPORT
1452
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1454developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1455The base foundry with paragraphs, sentences, and the text element are mandatory for
1456L<Krill|https://github.com/KorAP/Krill>.
1457
Akron821db3d2017-04-06 21:19:31 +02001458 Base
1459 #Paragraphs
1460 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001461
Akron821db3d2017-04-06 21:19:31 +02001462 Connexor
1463 #Morpho
1464 #Phrase
1465 #Sentences
1466 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001467
Akron821db3d2017-04-06 21:19:31 +02001468 CoreNLP
1469 #Constituency
1470 #Morpho
1471 #NamedEntities
1472 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001473
Akronce125b62017-06-19 11:54:36 +02001474 CMC
1475 #Morpho
1476
Akron821db3d2017-04-06 21:19:31 +02001477 DeReKo
1478 #Structure
Akronc13a1702016-03-15 19:33:14 +01001479
Akron57510c12019-01-04 14:58:53 +01001480 DGD
1481 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001482 #Structure
Akron57510c12019-01-04 14:58:53 +01001483
Akron821db3d2017-04-06 21:19:31 +02001484 DRuKoLa
1485 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001486
Akron821db3d2017-04-06 21:19:31 +02001487 Glemm
1488 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001489
Akronabb36902021-10-11 15:51:06 +02001490 Gingko
1491 #Morpho
1492
Akronea1aed52018-07-19 14:43:34 +02001493 HNC
1494 #Morpho
1495
Akron4c679192018-01-16 17:41:49 +01001496 LWC
1497 #Dependency
1498
Akron821db3d2017-04-06 21:19:31 +02001499 Malt
1500 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001501
Akron821db3d2017-04-06 21:19:31 +02001502 MarMoT
1503 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001504
Akron821db3d2017-04-06 21:19:31 +02001505 Mate
1506 #Dependency
1507 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001508
Akron821db3d2017-04-06 21:19:31 +02001509 MDParser
1510 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001511
Akrone85a7762022-07-22 08:05:03 +02001512 NKJP
1513 #Morpho
1514 #NamedEntities
1515
Akron821db3d2017-04-06 21:19:31 +02001516 OpenNLP
1517 #Morpho
1518 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001519
Akron07e24772020-04-23 14:00:54 +02001520 RWK
1521 #Morpho
1522 #Structure
1523
Akron821db3d2017-04-06 21:19:31 +02001524 Sgbr
1525 #Lemma
1526 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001527
Marc Kupietzb8c53822024-03-16 18:54:08 +01001528 Spacy
1529 #Morpho
1530
Akron7d5e6382019-08-08 16:36:27 +02001531 Talismane
1532 #Dependency
1533 #Morpho
1534
Akron821db3d2017-04-06 21:19:31 +02001535 TreeTagger
1536 #Morpho
1537 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001538
Akron83aedd32023-02-07 10:57:41 +01001539 UDPipe
1540 #Dependency
1541 #Morpho
1542
Akron821db3d2017-04-06 21:19:31 +02001543 XIP
1544 #Constituency
1545 #Morpho
1546 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001547
Akronc13a1702016-03-15 19:33:14 +01001548
1549More importers are in preparation.
1550New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1551See the built-in annotation importers as examples.
1552
Akronf73ffb62018-06-27 12:13:59 +02001553
Akron41e6c8b2021-10-14 20:22:18 +02001554=head1 METADATA SUPPORT
1555
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1557developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1558
1559=over 2
1560
1561=item I5 - Meta data for all I5 files
1562
1563=item Sgbr - Meta data from the Schreibgebrauch project
1564
1565=item Gingko - Meta data from the Gingko project in addition to I5
1566
Akron2532f1b2023-05-15 13:41:24 +02001567=item ICC - Meta data for the ICC in addition to I5
1568
Akron41e6c8b2021-10-14 20:22:18 +02001569=back
1570
1571More importers are in preparation.
1572New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1573See the built-in meta data importers as examples.
1574
1575
Akron8f69d632020-01-15 16:58:11 +01001576=head1 About KorAP-XML
1577
1578KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1579data model (Bański et al. 2013), where text data are stored physically
1580separated from their interpretations (i.e. annotations).
1581A text document in KorAP-XML therefore consists of several files
1582containing primary data, metadata and annotations.
1583
1584The structure of a single KorAP-XML document can be as follows:
1585
1586 - data.xml
1587 - header.xml
1588 + base
1589 - tokens.xml
1590 - ...
1591 + struct
1592 - structure.xml
1593 - ...
1594 + corenlp
1595 - morpho.xml
1596 - constituency.xml
1597 - ...
1598 + tree_tagger
1599 - morpho.xml
1600 - ...
1601 - ...
1602
1603The C<data.xml> contains the primary data, the C<header.xml> contains
1604the metadata, and the annotation layers are stored in subfolders
1605like C<base>, C<struct> or C<corenlp>
1606(so-called "foundries"; Bański et al. 2013).
1607
1608Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001609(Lüngen and Sperberg-McQueen 2012). See the documentation in
1610L<KorAP::XML::Meta::I5> for translatable fields.
1611
1612Annotations correspond to a variant of the TEI-P5 feature structures
1613(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001614Annotation feature structures refer to character sequences of the primary text
1615inside the C<text> element of the C<data.xml>.
1616A single annotation containing the lemma of a token can have the following structure:
1617
1618 <span from="0" to="3">
1619 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1620 <f name="lex">
1621 <fs>
1622 <f name="lemma">zum</f>
1623 </fs>
1624 </f>
1625 </fs>
1626 </span>
1627
The C<from> and C<to> attributes refer to the character span
1629in the primary text.
1630Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1631the structure may vary. See L<KorAP::XML::Annotation::*> for various
1632annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001633
1634Multiple KorAP-XML documents are organized on three levels following
1635the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
corpus E<gt> document E<gt> text. On each level, metadata information
can be stored, which C<korapxml2krill> will merge into a single metadata
1638object per text. A corpus is therefore structured as follows:
1639
1640 + <corpus>
1641 - header.xml
1642 + <document>
1643 - header.xml
1644 + <text>
1645 - data.xml
1646 - header.xml
1647 - ...
1648 - ...
1649
1650A single text can be identified by the concatenation of
1651the corpus identifier, the document identifier and the text identifier.
1652This identifier is called the text sigle
1653(e.g. a text with the identifier C<18486> in the document C<060> in the
1654corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1655
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1658(e.g. one zip file per foundry), which is also supported (see C<--input>).
1659
Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
in the form of a test suite.
1662The resulting JSON format merges all annotation layers
1663based on a single token stream.
1664
1665=head2 References
1666
1667Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1668KorAP data model: first approximation, December.
1669
1670Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1671"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1672Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1673L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1674
1675Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1676"Robust corpus architecture: a new look at virtual collections and data access",
1677Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1678L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1679
1680Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1681Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1683Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1684pp. 373-376.
1685L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1686
1687Harald Lüngen and C. M. Sperberg-McQueen (2012):
1688"A TEI P5 Document Grammar for the IDS Text Model",
1689Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1690L<PDF|https://journals.openedition.org/jtei/pdf/508>
1691
1692TEI Consortium, eds:
1693"Feature Structures",
1694Guidelines for Electronic Text Encoding and Interchange.
1695L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1696
Akron941c1a62016-02-23 17:41:41 +01001697=head1 AVAILABILITY
1698
1699 https://github.com/KorAP/KorAP-XML-Krill
1700
1701
1702=head1 COPYRIGHT AND LICENSE
1703
Akrona3518372024-01-22 23:29:00 +01001704Copyright (C) 2015-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001705
Akron6882d7d2021-02-08 09:43:57 +01001706Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001707
Contributors: Eliza Margaretha, Marc Kupietz
Akron941c1a62016-02-23 17:41:41 +01001709
Akron6882d7d2021-02-08 09:43:57 +01001710L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001711Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001712L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001713member of the
Akronf1849aa2019-12-16 23:35:33 +01001714L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001715
1716This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001717L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001718
1719=cut