blob: 0856c57d3f367cefe6608d55de7d383b8625ebce [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akrona472a242023-02-13 13:46:30 +0100170#
171# 2023/02/13
172# - Fix temporary-extract handling from configuration file.
173#
Akron941c1a62016-02-23 17:41:41 +0100174# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100175
# Date of the last change listed in the CHANGES section
our $LAST_CHANGE = '2023/02/13';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Koral version used, if none is given by the --koral option
our $KORAL_VERSION = 0.03;

# Message passed to pod2usage for help, version and error output
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
182
# Parse command: The first argument is taken as the command,
# unless it starts with a dash (and is therefore an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Remember the remaining arguments, so they can be passed on
# to the archive commands in serial processing
my @keep_argv = @ARGV;

# Values of options that can be passed multiple times
my @skip;
my @sigle;
my @anno;
my @input;

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100195
# Path to the configuration file (filled by --config)
my $cfg_file;

# Flags that are still accepted, but only result in a warning
my $warn_primary = sub { warn 'Primary flag no longer supported!' };
my $warn_pretty  = sub { warn 'Pretty flag no longer supported!' };

# Print the usage sections of the documentation
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version message
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line.
# Scalar values are written to %cfg using underscored keys.
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'gzip|z'                   => \$cfg{gzip},
  'to-tar'                   => \$cfg{to_tar},
  'temporary-extract|te=s'   => \$cfg{temporary_extract},
  'sequential-extraction|se' => \$cfg{sequential_extraction},

  # Conversion
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'lang=s'                   => \$cfg{lang},
  'koral|k=f'                => \$cfg{koral},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},

  # Selection of annotations and texts
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,

  # Caching
  'cache|c=s'                => \$cfg{cache_file},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Runtime
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'jobs|j=i'                 => \$cfg{jobs},

  # No longer supported
  'primary|p!'               => $warn_primary,
  'pretty|y'                 => $warn_pretty,

  # Information
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);

# Parameters for pod2usage in case of invalid calls:
# The same output as for --help, but with exit code 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);
Akron63f20d42017-04-10 23:40:29 +0200255
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Keys in the configuration file are written with dashes,
  # while the keys of the configuration hash use underscores
  my @scalar_keys = qw(
    output cache-size input-base token overwrite
    meta base-sentences base-paragraphs base-pagebreaks
    gzip to-tar log lang cache non-word-tokens
    non-verbal-tokens sequential-extraction
    temporary-extract cache-init
    koral extract-dir jobs
  );

  foreach my $key (@scalar_keys) {
    (my $underlined = $key) =~ tr/-/_/;

    # Values that are already defined are not overwritten
    next if defined $cfg{$underlined};
    next unless defined $config{$key};

    $cfg{$underlined} = $config{$key};
  }

  # Lists are given as a single value separated by semicolons
  # and are only used, if the list is still empty
  my %list_for = (
    skip  => \@skip,
    sigle => \@sigle,
    anno  => \@anno
  );

  foreach my $key (qw(skip sigle anno)) {
    my $list = $list_for{$key};

    next if scalar(@$list);
    next unless defined $config{$key};

    @$list = split /\s*;\s*/, $config{$key};
  }
}
291
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{temporary_extract};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored in $cfg{cache_file},
# while the key "cache" of the configuration file is stored in
# $cfg{cache}. Both are respected - the command line takes precedence.
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs         = $cfg{jobs}         // 0;
my $cache_delete = $cfg{cache_delete} // 1;

# Base annotations are compared in lowercase
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');

my $sequential_extraction = $cfg{sequential_extraction} // 0;
Akron63f20d42017-04-10 23:40:29 +0200306
# Get tokenization basis in the form "Foundry#Layer".
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined when the condition is false.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base // '');

# Remove file extension.
# The layer is undefined, if the token base contains no '#'.
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akron3741f8b2016-12-21 19:55:21 +0100312
# Convert sigle to path construct,
# e.g. "A_B.C" is converted to "A/B/C"
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}

# Lookup table of all lowercased values to skip
my %skip;
@skip{ map { lc($_) } @skip } = (1) x @skip;

# Write log messages to STDERR - by default only errors
my $log_level = uc($cfg{log} // 'ERROR');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron63f20d42017-04-10 23:40:29 +0200322
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first remaining argument
  my $log_file = shift @ARGV;

  # Fail, if no existing log file is given. The check for definedness
  # prevents a file test on an undefined value, in case the command
  # was called without any argument.
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  my $slimmer = KorAP::XML::Log::Slim->new($log_file);

  # Run log filter
  $slimmer->slim_to;

  exit;
};
344
345
# For commands with a given output, the output has to be
# an existing directory, unless the to-tar flag is defined
if ($cmd && $output && !defined($to_tar)) {
  unless (-e $output && -d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  }
}

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
}
Nils Diewald7364d1f2013-11-05 19:26:35 +0000356
# Start serial processing:
# Each input is processed by a separate archive command
if ($cmd && $cmd eq 'serial') {

  # Output is required, as otherwise the new output paths
  # would be created relative to the root directory
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set separately for each archive command
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag followed by a separate value
    if ($arg =~ /\A--?(?:input|i|output|o)\z/) {
      $remove_next = 1;
      next;
    }

    # Input or output flag with a joined value, e.g. --input=file
    elsif ($arg =~ /\A--?(?:input|i|output|o)=/) {
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;

  # Number of archive commands that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command - run with the current perl interpreter
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $archive_input, '-o', $new_out
    );
    print "Start serial processing of $archive_input to $new_out\n";
    print 'Command: ', join(' ', @archive_cmd), "\n";

    # Start archiving - the list form does not invoke a shell
    my $status = system @archive_cmd;

    # Report failing commands, but continue with the remaining inputs
    if ($status != 0) {
      $log->error(
        "Archive command for $archive_input failed" .
          ($status == -1 ? ": $!" : " (wait status $status)")
        );
      $failed++;
    };
  };

  # Reflect failing archive commands in the exit code
  exit($failed ? 1 : 0);
};
407
# Define supported (and preinstalled) transformation modules.
# Each entry is a list of the foundry name and the layer name,
# optionally followed by a further parameter.
my @layers = ();

# Base annotations are only added, if no base is given by option
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw(Morpho Syntax Phrase Sentences);

# CoreNLP
push @layers,
  map { ['CoreNLP', $_] } qw(NamedEntities Sentences Morpho Constituency);

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo: Collect all base annotations taken from the structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

push @layers, $dereko_attr[0]
  ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
  : ['DeReKo', 'Structure'];

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa, Gingko, Glemm and HNC
push @layers, map { [$_, 'Morpho'] } qw(DRuKoLa Gingko Glemm HNC);

# LWC and Malt
push @layers, map { [$_, 'Dependency'] } qw(LWC Malt);

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw(Morpho Dependency);

# MDParser
push @layers, ['MDParser', 'Dependency'];

# NKJP
push @layers, map { ['NKJP', $_] } qw(Morpho NamedEntities);

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw(Morpho Sentences);

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure'] if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw(Lemma Morpho);

# Talismane
push @layers, map { ['Talismane', $_] } qw(Dependency Morpho);

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw(Morpho Sentences);

# UDPipe
push @layers, map { ['UDPipe', $_] } qw(Morpho Dependency);

# XIP
push @layers,
  map { ['XIP', $_] } qw(Morpho Constituency Sentences Dependency);
Akrone1dbc382016-07-08 22:24:52 +0200534
Akron4fa37c32017-01-20 14:43:10 +0100535
# Check filters
my @filtered_anno;

# All predefined annotations are skipped:
# Only use the annotations given by the anno option
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my ($foundry, $layer) = map { lc($_) } @{$_}[0, 1];

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
}
554
Akrone1dbc382016-07-08 22:24:52 +0200555
# TODO: This should not be initialized for batch
# Cache shared via the cache file
my $cache = Cache::FastMmap->new(
  init_file  => $cfg{cache_init} // 1,
  cache_size => $cfg{cache_size} // '50m',
  share_file => $cache_file
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(

  # Tokenization basis
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,

  # Annotations to respect
  anno              => \@filtered_anno,

  # Conversion
  meta_type         => $cfg{meta},
  lang              => $cfg{lang},
  koral             => $cfg{koral}             // $KORAL_VERSION,
  non_word_tokens   => $cfg{non_word_tokens}   // 0,
  non_verbal_tokens => $cfg{non_verbal_tokens} // 0,

  # Output
  overwrite         => $cfg{overwrite},
  gzip              => $gzip,

  # Helper objects
  cache             => $cache,
  log               => $log
);
578
# Auto adjust jobs
if ($jobs eq '-1') {

  # Sys::Info is loaded at runtime only
  my $sys_info_loaded =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  my $cores;
  if ($sys_info_loaded) {
    $cores = Sys::Info->new->device('CPU')->count;
  }

  # Assume a single core, if Sys::Info can't be loaded
  else {
    $log->warn("Unable to determine number of cores");
    $cores = 1;
  }

  # Use five jobs per core
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
}
592
593
# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($pattern));
  };

  # Patterns with wildcards may match nothing.
  # Fail early in this case, as all following commands
  # rely on at least one input.
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
613
614
# Process a single file
unless ($cmd) {

  # Initialize the benchmarks at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  }

  # Log the time taken since the last stop and since the
  # initialization and remember the current time as the last stop
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info("The code took: $lap (overall: $overall)");

    $main::LAST_STOP = $now;
  }

  # Only the first input is processed
  my $input = $input[0];

  # Append a slash, if the input does not end with one
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  }

  stop_time();
  exit;
}
646
Nils Diewald59094f22014-11-05 18:20:50 +0000647
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  $output or pod2usage(%ERROR_HASH);

  # Create new archive object from the first input,
  # in case it is a plain file
  my $archive;
  if (-f $input[0]) {
    $archive = KorAP::XML::Archive->new($input[0]);
  }

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  }

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  }

  # Add further annotation archives
  foreach my $further (@input[1..$#input]) {
    $archive->attach($further);
  }

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  }
}
692
Akron81500102017-04-07 20:45:44 +0200693
Akron941c1a62016-02-23 17:41:41 +0100694# Process an archive
695elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000696
Akron81500102017-04-07 20:45:44 +0200697 my $archive_output;
698
699 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100700 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200701
702 # Create new archive object
703 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
704
705 # Check zip capabilities
706 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200707 $log->error("Unzip is not installed or incompatible.");
708 exit 1;
Akron81500102017-04-07 20:45:44 +0200709 };
710
711 # Add further annotation archived
712 $archive->attach($_) foreach @input[1..$#input];
713
714 # Create a temporary directory
715 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200716 $extract_dir = tempdir(CLEANUP => 0);
717 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200718 };
719
Akron63f20d42017-04-10 23:40:29 +0200720 # Add some random extra to avoid clashes with multiple archives
721 $extract_dir = catdir($extract_dir, random_string('cccccc'));
722
Akron31a08cb2019-02-20 20:43:26 +0100723 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200724 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akrona472a242023-02-13 13:46:30 +0100725 print "Extract sequentially to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200726 @input = ($extract_dir);
727 }
728 else {
729 $log->error('Unable to extract from primary archive ' . $input[0] .
730 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200731 exit 1;
Akron81500102017-04-07 20:45:44 +0200732 };
733 }
734
735 # Can't create archive object
736 else {
737 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200738 exit 1;
Akron81500102017-04-07 20:45:44 +0200739 };
740 };
741
Akron7d4cdd82016-08-17 21:39:45 +0200742 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100743 my $pool = Parallel::ForkManager->new($jobs);
744
Akron7d4cdd82016-08-17 21:39:45 +0200745 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100746 my $iter = 1; # Current text in process
747
Akronda3097e2017-04-23 19:53:57 +0200748 my $tar_archive;
749 my $output_dir = $output;
750 my $tar_fh;
751
752 # Initialize tar archive
753 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200754
755 # Set output name
756 my $tar_file = $output;
757 unless ($tar_file =~ /\.tar$/) {
758 $tar_file .= '.tar';
759 };
760
761 # Initiate the tar file
762 print "Writing to file $tar_file\n";
763 $tar_fh = IO::File->new($tar_file, 'w');
764 $tar_fh->binmode(1);
765
Akroneb370a02022-02-24 13:33:40 +0100766 # Use tar builder for archiving
767 if (eval("use Archive::Tar::Builder; 1;")) {
768 $tar_archive = Archive::Tar::Builder->new(
769 ignore_errors => 1
770 );
771
772 # Set handle
773 $tar_archive->set_handle($tar_fh);
774 }
775
776 # Fallback solution
777 else {
778 $tar_archive = KorAP::XML::TarBuilder->new(
779 $tar_fh
780 );
781 };
Akronda3097e2017-04-23 19:53:57 +0200782
783 # Output to temporary directory
784 $output_dir = File::Temp->newdir;
785 };
786
Akron941c1a62016-02-23 17:41:41 +0100787 # Report on fork message
788 $pool->run_on_finish (
789 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200790 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100791 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200792
Akron08385f62016-03-22 20:37:04 +0100793 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200794 ($iter++) . "/$count]" .
795 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200796 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200797
798 if (!$code && $to_tar && $data->[2]) {
799 my $filename = $data->[2];
800
801 # Lock filehandle
802 if (flock($tar_fh, LOCK_EX)) {
803
Akron9a062ce2017-07-04 19:12:05 +0200804 my $clean_file = fileparse($filename);
805
Akronda3097e2017-04-23 19:53:57 +0200806 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200807 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200808 unlink $filename;
809
810 # Unlock filehandle
811 flock($tar_fh, LOCK_UN);
812 }
813 else {
814 $log->warn("Unable to add $filename to archive");
815 };
816 };
817
Akron4c0cf312016-10-15 16:42:09 +0200818 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100819 }
820 );
821
822 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200823 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100824 print "Reading data ...\n";
825
Akron7d4cdd82016-08-17 21:39:45 +0200826 # unless (Cache::FastMmap->new(
827 # share_file => $cache_file,
828 # cache_size => $cache_size,
829 # init_file => $cache_init
830 # )) {
831 # print "Unable to intialize cache '$cache_file'\n\n";
832 # exit(1);
833 # };
Akron11c80302016-03-18 19:44:43 +0100834
Akron486f9ab2017-04-22 23:25:19 +0200835
Akron941c1a62016-02-23 17:41:41 +0100836 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100837 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200838 # TODO:
839 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100840 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100841 my @dirs;
842 my $dir;
843
Akron7d4cdd82016-08-17 21:39:45 +0200844 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100845 while (1) {
846 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200847 push @dirs, $dir;
848 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100849 };
850 last unless $it->next;
851 };
852
853 print "Start processing ...\n";
854 $t = Benchmark->new;
855 $count = scalar @dirs;
856
857 DIRECTORY_LOOP:
858 for (my $i = 0; $i < $count; $i++) {
859
Akrone1dbc382016-07-08 22:24:52 +0200860 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200861 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200862 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200863 );
Akron941c1a62016-02-23 17:41:41 +0100864
865 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200866 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200867
Akron13d56622016-10-31 14:54:49 +0100868 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200869 $pool->finish(
870 0,
Akronda3097e2017-04-23 19:53:57 +0200871 [
872 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
873 undef,
874 $filename
875 ]
Akron486f9ab2017-04-22 23:25:19 +0200876 );
Akron3ec48972016-08-17 23:24:52 +0200877 }
878 else {
Akron4c0cf312016-10-15 16:42:09 +0200879 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200880 };
Akron941c1a62016-02-23 17:41:41 +0100881 };
882 }
883
884 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200885 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200886
Akron941c1a62016-02-23 17:41:41 +0100887 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200888 $log->error("Unzip is not installed or incompatible.");
889 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100890 };
891
Akron08385f62016-03-22 20:37:04 +0100892 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200893 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100894
Akron31a08cb2019-02-20 20:43:26 +0100895 # Get sigles to extract
896 my $prefix = set_sigle($archive);
897
Akron941c1a62016-02-23 17:41:41 +0100898 print "Start processing ...\n";
899 $t = Benchmark->new;
900 my @dirs = $archive->list_texts;
901 $count = scalar @dirs;
902
903 ARCHIVE_LOOP:
904 for (my $i = 0; $i < $count; $i++) {
905
906 # Split path information
907 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
908
Akrone1dbc382016-07-08 22:24:52 +0200909 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200910 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200911 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200912 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200913 catfile($corpus, $doc, $text)
914 . '.json' . ($gzip ? '.gz' : '')
915 )
Akrone1dbc382016-07-08 22:24:52 +0200916 );
Akron941c1a62016-02-23 17:41:41 +0100917
918 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200919 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100920
Akron4c0cf312016-10-15 16:42:09 +0200921 # Create temporary file
922 $temp = File::Temp->newdir;
923
Akronbdf434a2016-10-24 17:42:07 +0200924 # TODO: Check if $filename exist at the beginning,
925 # because extraction can be horrible slow!
926
Akron941c1a62016-02-23 17:41:41 +0100927 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100928 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100929
Akron7d4cdd82016-08-17 21:39:45 +0200930 # Create corpus directory
931 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100932
Akron7d4cdd82016-08-17 21:39:45 +0200933 # Temporary directory
934 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100935
Akron7d4cdd82016-08-17 21:39:45 +0200936 # Write file
Akron13d56622016-10-31 14:54:49 +0100937 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200938
Akron4c0cf312016-10-15 16:42:09 +0200939 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100940 $pool->finish(
941 0,
Akronda3097e2017-04-23 19:53:57 +0200942 [
943 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
944 $temp,
945 $filename
946 ]
Akron13d56622016-10-31 14:54:49 +0100947 );
Akron7d4cdd82016-08-17 21:39:45 +0200948 }
949 else {
Akron4c0cf312016-10-15 16:42:09 +0200950 # Delete temporary file
951 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200952 };
Akron941c1a62016-02-23 17:41:41 +0100953 }
Akron7d4cdd82016-08-17 21:39:45 +0200954
955 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100956 else {
Akron4c0cf312016-10-15 16:42:09 +0200957 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100958 };
959 };
960 }
961
962 else {
963 print "Input is neither a directory nor an archive.\n\n";
964 };
965
966 $pool->wait_all_children;
967
Akron11c80302016-03-18 19:44:43 +0100968 # Delete cache file
969 unlink($cache_file) if $cache_delete;
970
Akronda3097e2017-04-23 19:53:57 +0200971 # Close tar filehandle
972 if ($to_tar && $tar_fh) {
973 $tar_archive->finish;
974 $tar_fh->close;
975 print "Wrote to tar archive.\n";
976 };
977
Akron63f20d42017-04-10 23:40:29 +0200978 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100979 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200980};
Akron941c1a62016-02-23 17:41:41 +0100981
Nils Diewald2db9ad02013-10-29 19:26:43 +0000982
# For an archive, this will create the list
# of all sigles to process.
#
# Works on the file-level @sigle list:
#  - If @sigle is empty, it is filled with the "corpus/doc/text"
#    sigle of every text listed by the archive.
#  - Otherwise every document sigle ("corpus/doc", optionally
#    preceded by "./" or ".\") is extracted to $output right away
#    and dropped from @sigle, while all other sigles are kept
#    as text sigles.
#
# Parameters:
#   $archive - the archive object; list_texts, split_path,
#              check_prefix and extract_sigle are called on it.
#
# Returns the prefix value: the first element split_path returned
# for the last listed text (no sigles given), the result of
# check_prefix (sigles given), or the default 1 if neither was called.
sub set_sigle {
  my $archive = shift;

  # Default, in case no text is listed and no prefix is checked
  my $prefix = 1;

  # No sigles given - take all texts of the archive
  unless (@sigle) {

    # A named loop variable is used, so method calls on the archive
    # can't clobber the global $_ while iterating
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Check only once (on the first sigle) if a prefix is needed
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
  };

  # Iterate over all sigle
  foreach my $item (@sigle) {

    # Sigle is a doc sigle
    if ($item =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$item ...";
      $check_prefix_once->();
      print "\n";

      # Extract the whole document immediately
      print '... ' . (
        $archive->extract_sigle([$item], $output, $sequential_extraction ? 1 : $jobs)
        ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $item;
      $check_prefix_once->();
    };
  };

  # Only the text sigles remain to be processed
  @sigle = @text_sigle;

  return $prefix;
};
1056
1057
# Clean up the extraction directory, if one is set,
# and log the value remove_tree returned for it
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};

# Finish the output with a line break
print "\n";
1066
Nils Diewald2db9ad02013-10-29 19:26:43 +00001067__END__
Akron941c1a62016-02-23 17:41:41 +01001068
1069=pod
1070
1071=encoding utf8
1072
1073=head1 NAME
1074
Akron42f48c12020-02-14 13:08:13 +01001075korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001076
1077
1078=head1 SYNOPSIS
1079
Akrona76d8352016-10-27 16:27:32 +02001080 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001081
Akron2fd402b2016-10-27 21:26:48 +02001082
Akron941c1a62016-02-23 17:41:41 +01001083=head1 DESCRIPTION
1084
1085L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1086compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001087The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001088
1089
1090=head1 INSTALLATION
1091
1092The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1093
Akronaf386982016-10-12 00:33:25 +02001094 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001095
Akronc13a1702016-03-15 19:33:14 +01001096In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001097be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001098Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001099Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1100Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001102
1103=head1 ARGUMENTS
1104
Akrona76d8352016-10-27 16:27:32 +02001105 $ korapxml2krill -z --input <directory> --output <filename>
1106
1107Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001108It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001109
Akron941c1a62016-02-23 17:41:41 +01001110=over 2
1111
1112=item B<archive>
1113
Akron081639e2017-04-21 19:01:39 +02001114 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001115
Akron2fd402b2016-10-27 21:26:48 +02001116Converts an archive of KorAP-XML documents. It expects a directory
1117(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001118
1119=item B<extract>
1120
Akrona76d8352016-10-27 16:27:32 +02001121 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1122
1123Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001124
Akron63f20d42017-04-10 23:40:29 +02001125=item B<serial>
1126
1127 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1128
1129Convert archives sequentially. The inputs are not merged but treated
1130as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001132are created based on the archive name. In case the C<--to-tar> flag is given,
1133the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001134
1135
Akron84b53ad2022-01-14 12:39:15 +01001136=item B<slimlog>
1137
1138 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1139
Filters out all useless (i.e. successful) information from logs to simplify
1141log checks. Expects no further options.
1142
1143
Akron941c1a62016-02-23 17:41:41 +01001144=back
1145
1146
1147=head1 OPTIONS
1148
1149=over 2
1150
Akrona76d8352016-10-27 16:27:32 +02001151=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001152
Akrona76d8352016-10-27 16:27:32 +02001153Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001154
Akron7606afa2016-10-25 16:23:49 +02001155Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001156document, while C<archive> expects a KorAP-XML corpus folder or a zip
1157file to batch process multiple files.
1158C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001159
Akrona76d8352016-10-27 16:27:32 +02001160C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001161that the first archive listed contains all primary data files
1162and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001163
Akron7606afa2016-10-25 16:23:49 +02001164 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001165
Akron821db3d2017-04-06 21:19:31 +02001166Input may also be defined using BSD glob wildcards.
1167
1168 -i 'file/news*.zip'
1169
1170The extended input array will be sorted in length order, so the shortest
1171path needs to contain all primary data files and all meta data files.
1172
Akron0c3e3752016-06-28 15:55:53 +02001173(The directory structure follows the base directory format,
1174that may include a C<.> root folder.
1175In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001176need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001178
Akron7606afa2016-10-25 16:23:49 +02001179To support zip files, a version of C<unzip> needs to be installed that is
1180compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001181
Akron7606afa2016-10-25 16:23:49 +02001182B<The root folder switch using the hash sign is experimental and
1183may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001184
Akronf73ffb62018-06-27 12:13:59 +02001185
Akron63f20d42017-04-10 23:40:29 +02001186=item B<--input-base|-ib> <directory>
1187
1188The base directory for inputs.
1189
1190
Akron941c1a62016-02-23 17:41:41 +01001191=item B<--output|-o> <directory|file>
1192
1193Output folder for archive processing or
1194document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001195writes to C<STDOUT> by default
1196(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001197
1198=item B<--overwrite|-w>
1199
1200Overwrite files that already exist.
1201
Akronf73ffb62018-06-27 12:13:59 +02001202
Akron3741f8b2016-12-21 19:55:21 +01001203=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001204
1205Define the default tokenization by specifying
1206the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001207of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001208This will directly take the file instead of running
1209the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001210
Akron3741f8b2016-12-21 19:55:21 +01001211
1212=item B<--base-sentences|-bs> <foundry>#<layer>
1213
1214Define the layer for base sentences.
1215If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001216Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1217layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001218
1219 Defaults to unset.
1220
1221
1222=item B<--base-paragraphs|-bp> <foundry>#<layer>
1223
1224Define the layer for base paragraphs.
1225If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001226Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001228
1229 Defaults to unset.
1230
1231
Akron41ac10b2017-02-08 22:47:25 +01001232=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1233
1234Define the layer for base pagebreaks.
1235Currently C<DeReKo#Structure> is the only layer supported.
1236
1237 Defaults to unset.
1238
1239
Akron941c1a62016-02-23 17:41:41 +01001240=item B<--skip|-s> <foundry>[#<layer>]
1241
Akronf7ad89e2016-03-16 18:22:47 +01001242Skip specific annotations by specifying the foundry
1243(and optionally the layer with a C<#>-prefix),
1244e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001245Can be set multiple times.
1246
Akronf73ffb62018-06-27 12:13:59 +02001247
Akronc13a1702016-03-15 19:33:14 +01001248=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001249
Akronf7ad89e2016-03-16 18:22:47 +01001250Convert specific annotations by specifying the foundry
1251(and optionally the layer with a C<#>-prefix),
1252e.g. C<Mate> or C<Mate#Morpho>.
1253Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001254
Akronf73ffb62018-06-27 12:13:59 +02001255
Akroned9baf02019-01-22 17:03:25 +01001256=item B<--non-word-tokens|-nwt>
1257
1258Tokenize non-word tokens like word tokens (defined as matching
1259C</[\d\w]/>). Useful to treat punctuations as tokens.
1260
1261 Defaults to unset.
1262
Akronf1849aa2019-12-16 23:35:33 +01001263
1264=item B<--non-verbal-tokens|-nvt>
1265
Tokenize non-verbal tokens marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1268
1269 Defaults to unset.
1270
1271
Akron941c1a62016-02-23 17:41:41 +01001272=item B<--jobs|-j>
1273
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001275for archive processing.
Akron11c80302016-03-18 19:44:43 +01001276Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001277
Akrona472a242023-02-13 13:46:30 +01001278If C<sequential-extraction> is not set to true, this will
Akron9ec88872017-04-12 16:29:06 +02001279also apply to extraction.
1280
Akronc11f7982017-02-21 21:20:14 +01001281Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001282times the number of available cores, in case L<Sys::Info>
1283is available.
Akronf7ad89e2016-03-16 18:22:47 +01001284This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001285
Akronf73ffb62018-06-27 12:13:59 +02001286
Akron263274c2019-02-07 09:48:30 +01001287=item B<--koral|-k>
1288
1289Version of the output format. Supported versions are:
1290C<0> for legacy serialization, C<0.03> for serialization
1291with metadata fields as key-values on the root object,
1292C<0.4> for serialization with metadata fields as a list
1293of C<"@type":"koral:field"> objects.
1294
1295Currently defaults to C<0.03>.
1296
1297
Akron9ec88872017-04-12 16:29:06 +02001298=item B<--sequential-extraction|-se>
1299
Flag to indicate that extraction runs sequentially, i.e. that the C<jobs> value does not apply to extraction.
1301Some systems may have problems with extracting multiple archives
1302to the same folder at the same time.
1303Can be flagged using C<--no-sequential-extraction> as well.
1304Defaults to C<false>.
1305
Akronf73ffb62018-06-27 12:13:59 +02001306
Akron35db6e32016-03-17 22:42:22 +01001307=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001308
Akron35db6e32016-03-17 22:42:22 +01001309Define the metadata parser to use. Defaults to C<I5>.
1310Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1311This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001312
Akronf73ffb62018-06-27 12:13:59 +02001313
Akron941c1a62016-02-23 17:41:41 +01001314=item B<--gzip|-z>
1315
Akronf7ad89e2016-03-16 18:22:47 +01001316Compress the output.
1317Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001318
Akronf73ffb62018-06-27 12:13:59 +02001319
Akron11c80302016-03-18 19:44:43 +01001320=item B<--cache|-c>
1321
1322File to mmap a cache (using L<Cache::FastMmap>).
1323Defaults to C<korapxml2krill.cache> in the calling directory.
1324
Akronf73ffb62018-06-27 12:13:59 +02001325
Akron11c80302016-03-18 19:44:43 +01001326=item B<--cache-size|-cs>
1327
1328Size of the cache. Defaults to C<50m>.
1329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron11c80302016-03-18 19:44:43 +01001331=item B<--cache-init|-ci>
1332
1333Initialize cache file.
1334Can be flagged using C<--no-cache-init> as well.
1335Defaults to C<true>.
1336
Akronf73ffb62018-06-27 12:13:59 +02001337
Akron11c80302016-03-18 19:44:43 +01001338=item B<--cache-delete|-cd>
1339
1340Delete cache file after processing.
1341Can be flagged using C<--no-cache-delete> as well.
1342Defaults to C<true>.
1343
Akronf73ffb62018-06-27 12:13:59 +02001344
Akron636aa112017-04-07 18:48:56 +02001345=item B<--config|-cfg>
1346
1347Configure the parameters of your call in a file
1348of key-value pairs with whitespace separator
1349
1350 overwrite 1
1351 token DeReKo#Structure
1352 ...
1353
1354Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001355C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001356C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001357C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001358C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001359C<base-sentences>, C<base-paragraphs>,
1360C<base-pagebreaks>,
1361C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001362(semicolon separated), C<anno> (semicolon separated).
1363
Akronf73ffb62018-06-27 12:13:59 +02001364Configuration parameters will always be overwritten by
1365passed parameters.
1366
1367
Akron81500102017-04-07 20:45:44 +02001368=item B<--temporary-extract|-te>
1369
Akrona472a242023-02-13 13:46:30 +01001370Only valid for the C<archive> and C<serial>
1371commands.
Akron81500102017-04-07 20:45:44 +02001372
1373This will first extract all files into a
1374directory and then will archive.
1375If the directory is given as C<:temp:>,
1376a temporary directory is used.
1377This is especially useful to avoid
1378massive unzipping and potential
1379network latency.
Akron636aa112017-04-07 18:48:56 +02001380
Akronf73ffb62018-06-27 12:13:59 +02001381
Akronc93a0802019-07-11 15:48:34 +02001382=item B<--to-tar>
1383
1384Only valid for the C<archive> command.
1385
1386Writes the output into a tar archive.
1387
1388
Akrone10ad322016-02-27 10:54:26 +01001389=item B<--sigle|-sg>
1390
Akron20807582016-10-26 17:11:34 +02001391Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001392Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001393I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001394Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001395In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001396On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001397
Akron64f7fae2022-07-27 12:45:33 +02001398=item B<--lang>
1399
1400Preferred language for metadata fields. In case multiple titles are
1401given (on any level) with different C<xml:lang> attributes,
1402the language given is preferred.
1403Because titles may have different sources and different priorities,
1404non-specific language titles may still be preferred in case the title
1405source has a higher priority.
1406
Akronf73ffb62018-06-27 12:13:59 +02001407
Akron941c1a62016-02-23 17:41:41 +01001408=item B<--log|-l>
1409
Akronb9c33812020-10-21 16:19:35 +02001410The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001411
Akronf73ffb62018-06-27 12:13:59 +02001412
Akron941c1a62016-02-23 17:41:41 +01001413=item B<--help|-h>
1414
Akron42f48c12020-02-14 13:08:13 +01001415Print help information.
Akron941c1a62016-02-23 17:41:41 +01001416
Akronf73ffb62018-06-27 12:13:59 +02001417
Akron941c1a62016-02-23 17:41:41 +01001418=item B<--version|-v>
1419
1420Print version information.
1421
1422=back
1423
Akronf73ffb62018-06-27 12:13:59 +02001424
Akronc13a1702016-03-15 19:33:14 +01001425=head1 ANNOTATION SUPPORT
1426
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1428developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1429The base foundry with paragraphs, sentences, and the text element are mandatory for
1430L<Krill|https://github.com/KorAP/Krill>.
1431
Akron821db3d2017-04-06 21:19:31 +02001432 Base
1433 #Paragraphs
1434 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001435
Akron821db3d2017-04-06 21:19:31 +02001436 Connexor
1437 #Morpho
1438 #Phrase
1439 #Sentences
1440 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001441
Akron821db3d2017-04-06 21:19:31 +02001442 CoreNLP
1443 #Constituency
1444 #Morpho
1445 #NamedEntities
1446 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001447
Akronce125b62017-06-19 11:54:36 +02001448 CMC
1449 #Morpho
1450
Akron821db3d2017-04-06 21:19:31 +02001451 DeReKo
1452 #Structure
Akronc13a1702016-03-15 19:33:14 +01001453
Akron57510c12019-01-04 14:58:53 +01001454 DGD
1455 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001456 #Structure
Akron57510c12019-01-04 14:58:53 +01001457
Akron821db3d2017-04-06 21:19:31 +02001458 DRuKoLa
1459 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001460
Akron821db3d2017-04-06 21:19:31 +02001461 Glemm
1462 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001463
Akronabb36902021-10-11 15:51:06 +02001464 Gingko
1465 #Morpho
1466
Akronea1aed52018-07-19 14:43:34 +02001467 HNC
1468 #Morpho
1469
Akron4c679192018-01-16 17:41:49 +01001470 LWC
1471 #Dependency
1472
Akron821db3d2017-04-06 21:19:31 +02001473 Malt
1474 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001475
Akron821db3d2017-04-06 21:19:31 +02001476 MarMoT
1477 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001478
Akron821db3d2017-04-06 21:19:31 +02001479 Mate
1480 #Dependency
1481 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001482
Akron821db3d2017-04-06 21:19:31 +02001483 MDParser
1484 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001485
Akrone85a7762022-07-22 08:05:03 +02001486 NKJP
1487 #Morpho
1488 #NamedEntities
1489
Akron821db3d2017-04-06 21:19:31 +02001490 OpenNLP
1491 #Morpho
1492 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001493
Akron07e24772020-04-23 14:00:54 +02001494 RWK
1495 #Morpho
1496 #Structure
1497
Akron821db3d2017-04-06 21:19:31 +02001498 Sgbr
1499 #Lemma
1500 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001501
Akron7d5e6382019-08-08 16:36:27 +02001502 Talismane
1503 #Dependency
1504 #Morpho
1505
Akron821db3d2017-04-06 21:19:31 +02001506 TreeTagger
1507 #Morpho
1508 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001509
Akron83aedd32023-02-07 10:57:41 +01001510 UDPipe
1511 #Dependency
1512 #Morpho
1513
Akron821db3d2017-04-06 21:19:31 +02001514 XIP
1515 #Constituency
1516 #Morpho
1517 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001518
Akronc13a1702016-03-15 19:33:14 +01001519
1520More importers are in preparation.
1521New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1522See the built-in annotation importers as examples.
1523
Akronf73ffb62018-06-27 12:13:59 +02001524
Akron41e6c8b2021-10-14 20:22:18 +02001525=head1 METADATA SUPPORT
1526
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1528developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1529
1530=over 2
1531
1532=item I5 - Meta data for all I5 files
1533
1534=item Sgbr - Meta data from the Schreibgebrauch project
1535
1536=item Gingko - Meta data from the Gingko project in addition to I5
1537
1538=back
1539
1540More importers are in preparation.
1541New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1542See the built-in meta data importers as examples.
1543
1544
Akron8f69d632020-01-15 16:58:11 +01001545=head1 About KorAP-XML
1546
1547KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1548data model (Bański et al. 2013), where text data are stored physically
1549separated from their interpretations (i.e. annotations).
1550A text document in KorAP-XML therefore consists of several files
1551containing primary data, metadata and annotations.
1552
1553The structure of a single KorAP-XML document can be as follows:
1554
1555 - data.xml
1556 - header.xml
1557 + base
1558 - tokens.xml
1559 - ...
1560 + struct
1561 - structure.xml
1562 - ...
1563 + corenlp
1564 - morpho.xml
1565 - constituency.xml
1566 - ...
1567 + tree_tagger
1568 - morpho.xml
1569 - ...
1570 - ...
1571
1572The C<data.xml> contains the primary data, the C<header.xml> contains
1573the metadata, and the annotation layers are stored in subfolders
1574like C<base>, C<struct> or C<corenlp>
1575(so-called "foundries"; Bański et al. 2013).
1576
1577Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001578(Lüngen and Sperberg-McQueen 2012). See the documentation in
1579L<KorAP::XML::Meta::I5> for translatable fields.
1580
1581Annotations correspond to a variant of the TEI-P5 feature structures
1582(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001583Annotation feature structures refer to character sequences of the primary text
1584inside the C<text> element of the C<data.xml>.
1585A single annotation containing the lemma of a token can have the following structure:
1586
1587 <span from="0" to="3">
1588 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1589 <f name="lex">
1590 <fs>
1591 <f name="lemma">zum</f>
1592 </fs>
1593 </f>
1594 </fs>
1595 </span>
1596
The C<from> and C<to> attributes refer to the character span
1598in the primary text.
1599Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1600the structure may vary. See L<KorAP::XML::Annotation::*> for various
1601annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001602
1603Multiple KorAP-XML documents are organized on three levels following
1604the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1605corpus E<gt> document E<gt> text. On each level metadata information
1606can be stored, that C<korapxml2krill> will merge to a single metadata
1607object per text. A corpus is therefore structured as follows:
1608
1609 + <corpus>
1610 - header.xml
1611 + <document>
1612 - header.xml
1613 + <text>
1614 - data.xml
1615 - header.xml
1616 - ...
1617 - ...
1618
1619A single text can be identified by the concatenation of
1620the corpus identifier, the document identifier and the text identifier.
1621This identifier is called the text sigle
1622(e.g. a text with the identifier C<18486> in the document C<060> in the
1623corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1624
These corpora are often stored in zip files, which C<korapxml2krill>
can process directly. Corpora may also be split in multiple zip archives
1627(e.g. one zip file per foundry), which is also supported (see C<--input>).
1628
Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
in the form of a test suite.
The resulting JSON format merges all annotation layers
based on a single token stream.
1633
1634=head2 References
1635
1636Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1637KorAP data model: first approximation, December.
1638
1639Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1640"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1641Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1642L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1643
1644Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1645"Robust corpus architecture: a new look at virtual collections and data access",
1646Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1647L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1648
Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
pp. 373-376.
L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1655
1656Harald Lüngen and C. M. Sperberg-McQueen (2012):
1657"A TEI P5 Document Grammar for the IDS Text Model",
1658Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1659L<PDF|https://journals.openedition.org/jtei/pdf/508>
1660
1661TEI Consortium, eds:
1662"Feature Structures",
1663Guidelines for Electronic Text Encoding and Interchange.
1664L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1665
Akron941c1a62016-02-23 17:41:41 +01001666=head1 AVAILABILITY
1667
1668 https://github.com/KorAP/KorAP-XML-Krill
1669
1670
1671=head1 COPYRIGHT AND LICENSE
1672
Akron83aedd32023-02-07 10:57:41 +01001673Copyright (C) 2015-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001674
Akron6882d7d2021-02-08 09:43:57 +01001675Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001676
Akrona76d8352016-10-27 16:27:32 +02001677Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001678
Akron6882d7d2021-02-08 09:43:57 +01001679L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001680Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001681L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001682member of the
Akronf1849aa2019-12-16 23:35:33 +01001683L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001684
1685This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001686L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001687
1688=cut