blob: f44361524fb195c5edced516f7b2cb618c3302b7 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron941c1a62016-02-23 17:41:41 +0100167# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100168
# Date of the most recent entry in the CHANGES list
our $LAST_CHANGE   = '2022/07/27';

# Directory the script is located in
our $LOCAL         = $FindBin::Bin;

# Koral version used as a fallback
our $KORAL_VERSION = 0.03;

# Message shown together with the usage information
our $VERSION_MSG   = 'Version ' . $KorAP::XML::Krill::VERSION .
  ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";

# Parse command: A leading argument not starting
# with a dash is treated as the command
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Keep a copy of the arguments before the options are parsed
my @keep_argv = @ARGV;

# Option values collected in lists
my @skip;
my @sigle;
my @anno;
my @input;

# Configuration hash
my %cfg;
Akrone10ad322016-02-27 10:54:26 +0100188
# Parameters for usage messages on invalid invocations
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Path to an optional configuration file
my $cfg_file;

# Parse options from the command line.
# Scalar values are written to the configuration hash,
# list values are collected in arrays.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cfg{cache_file},
  'config|cfg=s'             => \$cfg_file,
  'lang=s'                   => \$cfg{lang},
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Deprecated flags are accepted, but only trigger a warning
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },

  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the full usage information
  'help|h'                   => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v'                => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
Akron63f20d42017-04-10 23:40:29 +0200248
# Load from configuration and fill non-given data.
# Values passed on the command line are never overwritten.
if ($cfg_file && !-e $cfg_file) {

  # Don't silently ignore a requested but missing configuration file
  warn "Configuration file '$cfg_file' does not exist\n";
}

elsif ($cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar values: The configuration key is written with dashes,
  # the key in the configuration hash with underscores
  foreach (qw!output cache-size input-base token overwrite
              meta base-sentences base-paragraphs base-pagebreaks
              gzip to-tar log lang cache non-word-tokens
              non-verbal-tokens sequential-extraction
              temporary-extract cache-init
              koral extract-dir jobs!) {
    my $underlined = $_ =~ tr/-/_/r;
    if (!defined($cfg{$underlined}) && defined $config{$_}) {
      $cfg{$underlined} = $config{$_};
    };
  };

  # List values (skip, sigle, anno): Separated by semicolons
  # and only respected if nothing was passed on the command line
  foreach my $list (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $values) = @$list;
    if (!scalar(@$values) && defined $config{$key}) {
      @$values = split /\s*;\s*/, $config{$key};
    };
  };
};
284
# Init variables and set default values
my $output          = $cfg{output};
my $input_base      = $cfg{input_base};
my $gzip            = $cfg{gzip};
my $to_tar          = $cfg{to_tar};

# The command line option and the configuration key 'extract-dir'
# fill 'extract_dir', while the configuration key 'temporary-extract'
# fills 'temporary_extract' - respect both
my $extract_dir     = $cfg{extract_dir} // $cfg{temporary_extract};
my $token_base      = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache fills 'cache_file', while the
# configuration key 'cache' fills 'cache' - the command line wins
my $cache_file      = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';
my $jobs            = $cfg{jobs} // 0;
my $cache_delete    = $cfg{cache_delete} // 1;

# The base definitions are compared in lower case
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;
Akron63f20d42017-04-10 23:40:29 +0200299
# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is separated from the conditional assignment,
# as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined
# in case the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of everything to skip (in lower case)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200311
# Log to STDERR using the requested log level
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first remaining argument
  my $log_file = shift @ARGV;

  # The argument is missing or the file does not exist
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  my $slimmer = KorAP::XML::Log::Slim->new($log_file);
  $slimmer->slim_to;

  exit;
};
337
338
# For commands, a given output has to be an existing
# directory, unless the to-tar option is set
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Show the usage information if
# - no input is defined, or
# - gzip is requested without an output (where it has no effect)
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000349
# Start serial processing:
# Every input is passed to a separate 'archive' run of this script,
# writing to an own subdirectory (or tar file) below the output.
if ($cmd && $cmd eq 'serial') {

  # Output is required, as catdir() would otherwise
  # be called with an undefined base directory
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments to pass,
  # as they are set separately for every archive run
  my %io_flag = map { $_ => 1 } qw/-i --input -o --output/;
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - the value follows
    if ($io_flag{$_}) {
      $remove_next = 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    }

    # Input or output flag with attached value, e.g. --input=file.zip
    elsif (m/\A(?:-i|--input|-o|--output)=/) {
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - a failing run is reported,
    # but the remaining inputs are still processed
    if (system(@archive_cmd) != 0) {
      $log->error("Processing of $serial_input failed (status $?)");
    };
  };

  exit;
};
399
# Define supported (and preinstalled) transformation modules.
# Every entry is an array of the form [Foundry, Layer, (Parameter)].
# The order of the entries is kept as is.
my @layers = ();

# Append one [Foundry, Layer] entry per passed layer name
my $add_layers = sub {
  my ($foundry, @layer_names) = @_;
  push(@layers, [$foundry, $_]) foreach @layer_names;
};

# Base - only added if no base is defined for the structure
$add_layers->('Base', 'Sentences')  unless $base_sentences;
$add_layers->('Base', 'Paragraphs') unless $base_paragraphs;

# Connexor
$add_layers->('Connexor', qw/Morpho Syntax Phrase Sentences/);

# CoreNLP
$add_layers->('CoreNLP', qw/NamedEntities Sentences Morpho Constituency/);

# CMC
$add_layers->('CMC', 'Morpho');

# DeReKo - collect all structures based on DeReKo#Structure
my @dereko_attr = ();
push(@dereko_attr, 'sentences')  if $base_sentences  eq 'dereko#structure';
push(@dereko_attr, 'paragraphs') if $base_paragraphs eq 'dereko#structure';
push(@dereko_attr, 'pagebreaks') if $base_pagebreaks eq 'dereko#structure';

# The collected structures are passed as a single parameter
push(
  @layers,
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure'])
);

# DGD
$add_layers->('DGD', 'Morpho');
push(@layers, ['DGD', 'Structure', 'base-sentence'])
  if $base_sentences eq 'dgd#structure';

# DRuKoLa, Gingko, Glemm and HNC
$add_layers->($_, 'Morpho') foreach qw/DRuKoLa Gingko Glemm HNC/;

# LWC and Malt
$add_layers->($_, 'Dependency') foreach qw/LWC Malt/;

# Marmot
$add_layers->('MarMoT', 'Morpho');

# Mate
$add_layers->('Mate', qw/Morpho Dependency/);

# MDParser
$add_layers->('MDParser', 'Dependency');

# NKJP
$add_layers->('NKJP', qw/Morpho NamedEntities/);

# OpenNLP
$add_layers->('OpenNLP', qw/Morpho Sentences/);

# Redewiedergabe
$add_layers->('RWK', 'Morpho');
$add_layers->('RWK', 'Structure') if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
$add_layers->('Sgbr', qw/Lemma Morpho/);

# Talismane
$add_layers->('Talismane', qw/Dependency Morpho/);

# TreeTagger
$add_layers->('TreeTagger', qw/Morpho Sentences/);

# UDPipe
$add_layers->('UDPipe', qw/Morpho Dependency/);

# XIP
$add_layers->('XIP', qw/Morpho Constituency Sentences Dependency/);
Akrone1dbc382016-07-08 22:24:52 +0200526
Akron4fa37c32017-01-20 14:43:10 +0100527
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly
# requested annotations (Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Add to index file - respect skipping of
  # either the Foundry or the Foundry#Layer
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
546
Akrone1dbc382016-07-08 22:24:52 +0200547
# TODO: This should not be initialized for batch
my $cache = do {

  # Size and initialization of the cache fall back to defaults
  my @cache_param = (
    share_file => $cache_file,
    cache_size => $cfg{cache_size} // '50m',
    init_file  => $cfg{cache_init} // 1
  );

  Cache::FastMmap->new(@cache_param);
};

# Create batch object
my $batch_file = do {
  my @batch_param = (
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},

    # Tokenization basis
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,

    gzip              => $gzip,
    log               => $log,
    koral             => $cfg{koral} // $KORAL_VERSION,

    # Annotations to respect
    anno              => \@filtered_anno,
    non_word_tokens   => $cfg{non_word_tokens} // 0,
    non_verbal_tokens => $cfg{non_verbal_tokens} // 0,
    lang              => $cfg{lang},
  );

  KorAP::XML::Batch::File->new(@batch_param);
};
570
# Auto adjust jobs: A value of -1 means five jobs per core
if ($jobs eq '-1') {

  # Sys::Info is optional, therefore it is loaded at runtime
  my $sys_info_available =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  my $cores;
  if ($sys_info_available) {
    $cores = Sys::Info->new->device('CPU')->count;
  }

  # Fall back to a single core
  else {
    $cores = 1;
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
584
585
# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@new_input, bsd_glob($pattern));
  };

  # A wildcard without any match would leave no input at all
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
605
606
# Process a single file
unless ($cmd) {

  # Take the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and since the start of the script
  sub stop_time {
    my $now        = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $since_last . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Only the first input is respected
  my $input = $input[0];

  # Create and parse new document -
  # the path is expected to end with a slash
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
638
Nils Diewald59094f22014-11-05 18:20:50 +0000639
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object, in case the primary input is a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $anno_archive (@input[1..$#input]) {
    $archive->attach($anno_archive);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    # Report on success or failure
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
684
Akron81500102017-04-07 20:45:44 +0200685
Akron941c1a62016-02-23 17:41:41 +0100686# Process an archive
687elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000688
Akron81500102017-04-07 20:45:44 +0200689 my $archive_output;
690
691 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100692 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200693
694 # Create new archive object
695 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
696
697 # Check zip capabilities
698 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200699 $log->error("Unzip is not installed or incompatible.");
700 exit 1;
Akron81500102017-04-07 20:45:44 +0200701 };
702
703 # Add further annotation archived
704 $archive->attach($_) foreach @input[1..$#input];
705
706 # Create a temporary directory
707 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200708 $extract_dir = tempdir(CLEANUP => 0);
709 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200710 };
711
Akron63f20d42017-04-10 23:40:29 +0200712 # Add some random extra to avoid clashes with multiple archives
713 $extract_dir = catdir($extract_dir, random_string('cccccc'));
714
Akron31a08cb2019-02-20 20:43:26 +0100715 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200716 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200717 @input = ($extract_dir);
718 }
719 else {
720 $log->error('Unable to extract from primary archive ' . $input[0] .
721 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200722 exit 1;
Akron81500102017-04-07 20:45:44 +0200723 };
724 }
725
726 # Can't create archive object
727 else {
728 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200729 exit 1;
Akron81500102017-04-07 20:45:44 +0200730 };
731 };
732
Akron7d4cdd82016-08-17 21:39:45 +0200733 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100734 my $pool = Parallel::ForkManager->new($jobs);
735
Akron7d4cdd82016-08-17 21:39:45 +0200736 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100737 my $iter = 1; # Current text in process
738
Akronda3097e2017-04-23 19:53:57 +0200739 my $tar_archive;
740 my $output_dir = $output;
741 my $tar_fh;
742
743 # Initialize tar archive
744 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200745
746 # Set output name
747 my $tar_file = $output;
748 unless ($tar_file =~ /\.tar$/) {
749 $tar_file .= '.tar';
750 };
751
752 # Initiate the tar file
753 print "Writing to file $tar_file\n";
754 $tar_fh = IO::File->new($tar_file, 'w');
755 $tar_fh->binmode(1);
756
Akroneb370a02022-02-24 13:33:40 +0100757 # Use tar builder for archiving
758 if (eval("use Archive::Tar::Builder; 1;")) {
759 $tar_archive = Archive::Tar::Builder->new(
760 ignore_errors => 1
761 );
762
763 # Set handle
764 $tar_archive->set_handle($tar_fh);
765 }
766
767 # Fallback solution
768 else {
769 $tar_archive = KorAP::XML::TarBuilder->new(
770 $tar_fh
771 );
772 };
Akronda3097e2017-04-23 19:53:57 +0200773
774 # Output to temporary directory
775 $output_dir = File::Temp->newdir;
776 };
777
Akron941c1a62016-02-23 17:41:41 +0100778 # Report on fork message
779 $pool->run_on_finish (
780 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200781 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100782 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200783
Akron08385f62016-03-22 20:37:04 +0100784 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200785 ($iter++) . "/$count]" .
786 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200787 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200788
789 if (!$code && $to_tar && $data->[2]) {
790 my $filename = $data->[2];
791
792 # Lock filehandle
793 if (flock($tar_fh, LOCK_EX)) {
794
Akron9a062ce2017-07-04 19:12:05 +0200795 my $clean_file = fileparse($filename);
796
Akronda3097e2017-04-23 19:53:57 +0200797 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200798 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200799 unlink $filename;
800
801 # Unlock filehandle
802 flock($tar_fh, LOCK_UN);
803 }
804 else {
805 $log->warn("Unable to add $filename to archive");
806 };
807 };
808
Akron4c0cf312016-10-15 16:42:09 +0200809 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100810 }
811 );
812
813 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200814 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100815 print "Reading data ...\n";
816
Akron7d4cdd82016-08-17 21:39:45 +0200817 # unless (Cache::FastMmap->new(
818 # share_file => $cache_file,
819 # cache_size => $cache_size,
820 # init_file => $cache_init
821 # )) {
822 # print "Unable to intialize cache '$cache_file'\n\n";
823 # exit(1);
824 # };
Akron11c80302016-03-18 19:44:43 +0100825
Akron486f9ab2017-04-22 23:25:19 +0200826
Akron941c1a62016-02-23 17:41:41 +0100827 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100828 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200829 # TODO:
830 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100831 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100832 my @dirs;
833 my $dir;
834
Akron7d4cdd82016-08-17 21:39:45 +0200835 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100836 while (1) {
837 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200838 push @dirs, $dir;
839 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100840 };
841 last unless $it->next;
842 };
843
844 print "Start processing ...\n";
845 $t = Benchmark->new;
846 $count = scalar @dirs;
847
848 DIRECTORY_LOOP:
849 for (my $i = 0; $i < $count; $i++) {
850
Akrone1dbc382016-07-08 22:24:52 +0200851 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200852 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200853 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200854 );
Akron941c1a62016-02-23 17:41:41 +0100855
856 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200857 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200858
Akron13d56622016-10-31 14:54:49 +0100859 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200860 $pool->finish(
861 0,
Akronda3097e2017-04-23 19:53:57 +0200862 [
863 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
864 undef,
865 $filename
866 ]
Akron486f9ab2017-04-22 23:25:19 +0200867 );
Akron3ec48972016-08-17 23:24:52 +0200868 }
869 else {
Akron4c0cf312016-10-15 16:42:09 +0200870 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200871 };
Akron941c1a62016-02-23 17:41:41 +0100872 };
873 }
874
875 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200876 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200877
Akron941c1a62016-02-23 17:41:41 +0100878 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200879 $log->error("Unzip is not installed or incompatible.");
880 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100881 };
882
Akron08385f62016-03-22 20:37:04 +0100883 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200884 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100885
Akron31a08cb2019-02-20 20:43:26 +0100886 # Get sigles to extract
887 my $prefix = set_sigle($archive);
888
Akron941c1a62016-02-23 17:41:41 +0100889 print "Start processing ...\n";
890 $t = Benchmark->new;
891 my @dirs = $archive->list_texts;
892 $count = scalar @dirs;
893
894 ARCHIVE_LOOP:
895 for (my $i = 0; $i < $count; $i++) {
896
897 # Split path information
898 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
899
Akrone1dbc382016-07-08 22:24:52 +0200900 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200901 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200902 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200903 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200904 catfile($corpus, $doc, $text)
905 . '.json' . ($gzip ? '.gz' : '')
906 )
Akrone1dbc382016-07-08 22:24:52 +0200907 );
Akron941c1a62016-02-23 17:41:41 +0100908
909 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200910 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100911
Akron4c0cf312016-10-15 16:42:09 +0200912 # Create temporary file
913 $temp = File::Temp->newdir;
914
Akronbdf434a2016-10-24 17:42:07 +0200915 # TODO: Check if $filename exist at the beginning,
916 # because extraction can be horrible slow!
917
Akron941c1a62016-02-23 17:41:41 +0100918 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100919 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100920
Akron7d4cdd82016-08-17 21:39:45 +0200921 # Create corpus directory
922 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100923
Akron7d4cdd82016-08-17 21:39:45 +0200924 # Temporary directory
925 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100926
Akron7d4cdd82016-08-17 21:39:45 +0200927 # Write file
Akron13d56622016-10-31 14:54:49 +0100928 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200929
Akron4c0cf312016-10-15 16:42:09 +0200930 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100931 $pool->finish(
932 0,
Akronda3097e2017-04-23 19:53:57 +0200933 [
934 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
935 $temp,
936 $filename
937 ]
Akron13d56622016-10-31 14:54:49 +0100938 );
Akron7d4cdd82016-08-17 21:39:45 +0200939 }
940 else {
Akron4c0cf312016-10-15 16:42:09 +0200941 # Delete temporary file
942 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200943 };
Akron941c1a62016-02-23 17:41:41 +0100944 }
Akron7d4cdd82016-08-17 21:39:45 +0200945
946 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100947 else {
Akron4c0cf312016-10-15 16:42:09 +0200948 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100949 };
950 };
951 }
952
953 else {
954 print "Input is neither a directory nor an archive.\n\n";
955 };
956
957 $pool->wait_all_children;
958
Akron11c80302016-03-18 19:44:43 +0100959 # Delete cache file
960 unlink($cache_file) if $cache_delete;
961
Akronda3097e2017-04-23 19:53:57 +0200962 # Close tar filehandle
963 if ($to_tar && $tar_fh) {
964 $tar_archive->finish;
965 $tar_fh->close;
966 print "Wrote to tar archive.\n";
967 };
968
Akron63f20d42017-04-10 23:40:29 +0200969 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100970 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200971};
Akron941c1a62016-02-23 17:41:41 +0100972
Nils Diewald2db9ad02013-10-29 19:26:43 +0000973
# For an archive, this will create the list
# of all sigles to process.
#
# Takes the archive object as the only parameter and works on the
# file-level @sigle list:
# - If no sigles are given, @sigle is filled with one
#   "corpus/doc/text" entry per text listed by the archive.
# - If sigles are given, every doc sigle (two path segments) is
#   extracted immediately to $output and dropped from @sigle,
#   so only text sigles remain.
#
# Returns the prefix value, which defaults to 1 and is otherwise
# taken from split_path() or check_prefix() of the archive.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Check if a prefix is needed - this is done only once,
  # no matter which kind of sigle triggers it first
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Iterate over all sigle
  foreach my $item (@sigle) {

    # Sigle is a doc sigle
    if ($item =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$item ...";
      $check_prefix_once->();
      print "\n";

      print '... ' . (
        $archive->extract_sigle([$item], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $item;
      $check_prefix_once->();
    };
  };

  # Only text sigles are left to process
  @sigle = @text_sigle;

  return $prefix;
};
1047
1048
# Remove the temporary extraction directory (if one was used)
# and log how many objects were removed
if ($extract_dir) {
  my %remove_opts = (safe => 1);
  my $objects = remove_tree($extract_dir, \%remove_opts);
  $log->info("Removed directory $extract_dir with $objects objects");
};


# Finish the console output with a final newline
print "\n";
1057
Nils Diewald2db9ad02013-10-29 19:26:43 +00001058__END__
Akron941c1a62016-02-23 17:41:41 +01001059
1060=pod
1061
1062=encoding utf8
1063
1064=head1 NAME
1065
Akron42f48c12020-02-14 13:08:13 +01001066korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001067
1068
1069=head1 SYNOPSIS
1070
Akrona76d8352016-10-27 16:27:32 +02001071 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001072
Akron2fd402b2016-10-27 21:26:48 +02001073
Akron941c1a62016-02-23 17:41:41 +01001074=head1 DESCRIPTION
1075
1076L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1077compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001078The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001079
1080
1081=head1 INSTALLATION
1082
1083The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1084
Akronaf386982016-10-12 00:33:25 +02001085 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001086
Akronc13a1702016-03-15 19:33:14 +01001087In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001088be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001089Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001090Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1091Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001093
1094=head1 ARGUMENTS
1095
Akrona76d8352016-10-27 16:27:32 +02001096 $ korapxml2krill -z --input <directory> --output <filename>
1097
1098Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001099It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001100
Akron941c1a62016-02-23 17:41:41 +01001101=over 2
1102
1103=item B<archive>
1104
Akron081639e2017-04-21 19:01:39 +02001105 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001106
Akron2fd402b2016-10-27 21:26:48 +02001107Converts an archive of KorAP-XML documents. It expects a directory
1108(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001109
1110=item B<extract>
1111
Akrona76d8352016-10-27 16:27:32 +02001112 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1113
1114Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001115
Akron63f20d42017-04-10 23:40:29 +02001116=item B<serial>
1117
1118 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1119
1120Convert archives sequentially. The inputs are not merged but treated
1121as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001123are created based on the archive name. In case the C<--to-tar> flag is given,
1124the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001125
1126
Akron84b53ad2022-01-14 12:39:15 +01001127=item B<slimlog>
1128
1129 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1130
Filters out all useless aka successful information from logs, to simplify
1132log checks. Expects no further options.
1133
1134
Akron941c1a62016-02-23 17:41:41 +01001135=back
1136
1137
1138=head1 OPTIONS
1139
1140=over 2
1141
Akrona76d8352016-10-27 16:27:32 +02001142=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001143
Akrona76d8352016-10-27 16:27:32 +02001144Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001145
Akron7606afa2016-10-25 16:23:49 +02001146Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001147document, while C<archive> expects a KorAP-XML corpus folder or a zip
1148file to batch process multiple files.
1149C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001150
Akrona76d8352016-10-27 16:27:32 +02001151C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001152that the first archive listed contains all primary data files
1153and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001154
Akron7606afa2016-10-25 16:23:49 +02001155 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001156
Akron821db3d2017-04-06 21:19:31 +02001157Input may also be defined using BSD glob wildcards.
1158
1159 -i 'file/news*.zip'
1160
1161The extended input array will be sorted in length order, so the shortest
1162path needs to contain all primary data files and all meta data files.
1163
Akron0c3e3752016-06-28 15:55:53 +02001164(The directory structure follows the base directory format,
1165that may include a C<.> root folder.
1166In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001167need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001169
Akron7606afa2016-10-25 16:23:49 +02001170To support zip files, a version of C<unzip> needs to be installed that is
1171compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001172
Akron7606afa2016-10-25 16:23:49 +02001173B<The root folder switch using the hash sign is experimental and
1174may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001175
Akronf73ffb62018-06-27 12:13:59 +02001176
Akron63f20d42017-04-10 23:40:29 +02001177=item B<--input-base|-ib> <directory>
1178
1179The base directory for inputs.
1180
1181
Akron941c1a62016-02-23 17:41:41 +01001182=item B<--output|-o> <directory|file>
1183
1184Output folder for archive processing or
1185document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001186writes to C<STDOUT> by default
1187(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001188
1189=item B<--overwrite|-w>
1190
1191Overwrite files that already exist.
1192
Akronf73ffb62018-06-27 12:13:59 +02001193
Akron3741f8b2016-12-21 19:55:21 +01001194=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001195
1196Define the default tokenization by specifying
1197the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001198of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001199This will directly take the file instead of running
1200the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001201
Akron3741f8b2016-12-21 19:55:21 +01001202
1203=item B<--base-sentences|-bs> <foundry>#<layer>
1204
1205Define the layer for base sentences.
1206If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001207Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1208layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001209
1210 Defaults to unset.
1211
1212
1213=item B<--base-paragraphs|-bp> <foundry>#<layer>
1214
1215Define the layer for base paragraphs.
1216If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001217Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001219
1220 Defaults to unset.
1221
1222
Akron41ac10b2017-02-08 22:47:25 +01001223=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1224
1225Define the layer for base pagebreaks.
1226Currently C<DeReKo#Structure> is the only layer supported.
1227
1228 Defaults to unset.
1229
1230
Akron941c1a62016-02-23 17:41:41 +01001231=item B<--skip|-s> <foundry>[#<layer>]
1232
Akronf7ad89e2016-03-16 18:22:47 +01001233Skip specific annotations by specifying the foundry
1234(and optionally the layer with a C<#>-prefix),
1235e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001236Can be set multiple times.
1237
Akronf73ffb62018-06-27 12:13:59 +02001238
Akronc13a1702016-03-15 19:33:14 +01001239=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001240
Akronf7ad89e2016-03-16 18:22:47 +01001241Convert specific annotations by specifying the foundry
1242(and optionally the layer with a C<#>-prefix),
1243e.g. C<Mate> or C<Mate#Morpho>.
1244Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001245
Akronf73ffb62018-06-27 12:13:59 +02001246
Akroned9baf02019-01-22 17:03:25 +01001247=item B<--non-word-tokens|-nwt>
1248
1249Tokenize non-word tokens like word tokens (defined as matching
1250C</[\d\w]/>). Useful to treat punctuations as tokens.
1251
1252 Defaults to unset.
1253
Akronf1849aa2019-12-16 23:35:33 +01001254
1255=item B<--non-verbal-tokens|-nvt>
1256
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1259
1260 Defaults to unset.
1261
1262
Akron941c1a62016-02-23 17:41:41 +01001263=item B<--jobs|-j>
1264
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001266for archive processing.
Akron11c80302016-03-18 19:44:43 +01001267Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001268
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1271
Akronc11f7982017-02-21 21:20:14 +01001272Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001273times the number of available cores, in case L<Sys::Info>
1274is available.
Akronf7ad89e2016-03-16 18:22:47 +01001275This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001276
Akronf73ffb62018-06-27 12:13:59 +02001277
Akron263274c2019-02-07 09:48:30 +01001278=item B<--koral|-k>
1279
1280Version of the output format. Supported versions are:
1281C<0> for legacy serialization, C<0.03> for serialization
1282with metadata fields as key-values on the root object,
1283C<0.4> for serialization with metadata fields as a list
1284of C<"@type":"koral:field"> objects.
1285
1286Currently defaults to C<0.03>.
1287
1288
Akron9ec88872017-04-12 16:29:06 +02001289=item B<--sequential-extraction|-se>
1290
Flag to indicate that the C<jobs> value does not apply to extraction.
1292Some systems may have problems with extracting multiple archives
1293to the same folder at the same time.
1294Can be flagged using C<--no-sequential-extraction> as well.
1295Defaults to C<false>.
1296
Akronf73ffb62018-06-27 12:13:59 +02001297
Akron35db6e32016-03-17 22:42:22 +01001298=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001299
Akron35db6e32016-03-17 22:42:22 +01001300Define the metadata parser to use. Defaults to C<I5>.
1301Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1302This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001303
Akronf73ffb62018-06-27 12:13:59 +02001304
Akron941c1a62016-02-23 17:41:41 +01001305=item B<--gzip|-z>
1306
Akronf7ad89e2016-03-16 18:22:47 +01001307Compress the output.
1308Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001309
Akronf73ffb62018-06-27 12:13:59 +02001310
Akron11c80302016-03-18 19:44:43 +01001311=item B<--cache|-c>
1312
1313File to mmap a cache (using L<Cache::FastMmap>).
1314Defaults to C<korapxml2krill.cache> in the calling directory.
1315
Akronf73ffb62018-06-27 12:13:59 +02001316
Akron11c80302016-03-18 19:44:43 +01001317=item B<--cache-size|-cs>
1318
1319Size of the cache. Defaults to C<50m>.
1320
Akronf73ffb62018-06-27 12:13:59 +02001321
Akron11c80302016-03-18 19:44:43 +01001322=item B<--cache-init|-ci>
1323
1324Initialize cache file.
1325Can be flagged using C<--no-cache-init> as well.
1326Defaults to C<true>.
1327
Akronf73ffb62018-06-27 12:13:59 +02001328
Akron11c80302016-03-18 19:44:43 +01001329=item B<--cache-delete|-cd>
1330
1331Delete cache file after processing.
1332Can be flagged using C<--no-cache-delete> as well.
1333Defaults to C<true>.
1334
Akronf73ffb62018-06-27 12:13:59 +02001335
Akron636aa112017-04-07 18:48:56 +02001336=item B<--config|-cfg>
1337
1338Configure the parameters of your call in a file
1339of key-value pairs with whitespace separator
1340
1341 overwrite 1
1342 token DeReKo#Structure
1343 ...
1344
1345Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001346C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001347C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001348C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001349C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001350C<base-sentences>, C<base-paragraphs>,
1351C<base-pagebreaks>,
1352C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001353(semicolon separated), C<anno> (semicolon separated).
1354
Akronf73ffb62018-06-27 12:13:59 +02001355Configuration parameters will always be overwritten by
1356passed parameters.
1357
1358
Akron81500102017-04-07 20:45:44 +02001359=item B<--temporary-extract|-te>
1360
1361Only valid for the C<archive> command.
1362
1363This will first extract all files into a
1364directory and then will archive.
1365If the directory is given as C<:temp:>,
1366a temporary directory is used.
1367This is especially useful to avoid
1368massive unzipping and potential
1369network latency.
Akron636aa112017-04-07 18:48:56 +02001370
Akronf73ffb62018-06-27 12:13:59 +02001371
Akronc93a0802019-07-11 15:48:34 +02001372=item B<--to-tar>
1373
1374Only valid for the C<archive> command.
1375
1376Writes the output into a tar archive.
1377
1378
Akrone10ad322016-02-27 10:54:26 +01001379=item B<--sigle|-sg>
1380
Akron20807582016-10-26 17:11:34 +02001381Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001382Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001383I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001384Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001385In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001386On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001387
Akron64f7fae2022-07-27 12:45:33 +02001388=item B<--lang>
1389
1390Preferred language for metadata fields. In case multiple titles are
1391given (on any level) with different C<xml:lang> attributes,
1392the language given is preferred.
1393Because titles may have different sources and different priorities,
1394non-specific language titles may still be preferred in case the title
1395source has a higher priority.
1396
Akronf73ffb62018-06-27 12:13:59 +02001397
Akron941c1a62016-02-23 17:41:41 +01001398=item B<--log|-l>
1399
Akronb9c33812020-10-21 16:19:35 +02001400The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001401
Akronf73ffb62018-06-27 12:13:59 +02001402
Akron941c1a62016-02-23 17:41:41 +01001403=item B<--help|-h>
1404
Akron42f48c12020-02-14 13:08:13 +01001405Print help information.
Akron941c1a62016-02-23 17:41:41 +01001406
Akronf73ffb62018-06-27 12:13:59 +02001407
Akron941c1a62016-02-23 17:41:41 +01001408=item B<--version|-v>
1409
1410Print version information.
1411
1412=back
1413
Akronf73ffb62018-06-27 12:13:59 +02001414
Akronc13a1702016-03-15 19:33:14 +01001415=head1 ANNOTATION SUPPORT
1416
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1421
Akron821db3d2017-04-06 21:19:31 +02001422 Base
1423 #Paragraphs
1424 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001425
Akron821db3d2017-04-06 21:19:31 +02001426 Connexor
1427 #Morpho
1428 #Phrase
1429 #Sentences
1430 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001431
Akron821db3d2017-04-06 21:19:31 +02001432 CoreNLP
1433 #Constituency
1434 #Morpho
1435 #NamedEntities
1436 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001437
Akronce125b62017-06-19 11:54:36 +02001438 CMC
1439 #Morpho
1440
Akron821db3d2017-04-06 21:19:31 +02001441 DeReKo
1442 #Structure
Akronc13a1702016-03-15 19:33:14 +01001443
Akron57510c12019-01-04 14:58:53 +01001444 DGD
1445 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001446 #Structure
Akron57510c12019-01-04 14:58:53 +01001447
Akron821db3d2017-04-06 21:19:31 +02001448 DRuKoLa
1449 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001450
Akron821db3d2017-04-06 21:19:31 +02001451 Glemm
1452 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001453
Akronabb36902021-10-11 15:51:06 +02001454 Gingko
1455 #Morpho
1456
Akronea1aed52018-07-19 14:43:34 +02001457 HNC
1458 #Morpho
1459
Akron4c679192018-01-16 17:41:49 +01001460 LWC
1461 #Dependency
1462
Akron821db3d2017-04-06 21:19:31 +02001463 Malt
1464 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001465
Akron821db3d2017-04-06 21:19:31 +02001466 MarMoT
1467 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001468
Akron821db3d2017-04-06 21:19:31 +02001469 Mate
1470 #Dependency
1471 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001472
Akron821db3d2017-04-06 21:19:31 +02001473 MDParser
1474 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001475
Akrone85a7762022-07-22 08:05:03 +02001476 NKJP
1477 #Morpho
1478 #NamedEntities
1479
Akron821db3d2017-04-06 21:19:31 +02001480 OpenNLP
1481 #Morpho
1482 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001483
Akron07e24772020-04-23 14:00:54 +02001484 RWK
1485 #Morpho
1486 #Structure
1487
Akron821db3d2017-04-06 21:19:31 +02001488 Sgbr
1489 #Lemma
1490 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001491
Akron7d5e6382019-08-08 16:36:27 +02001492 Talismane
1493 #Dependency
1494 #Morpho
1495
Akron821db3d2017-04-06 21:19:31 +02001496 TreeTagger
1497 #Morpho
1498 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001499
Akron821db3d2017-04-06 21:19:31 +02001500 XIP
1501 #Constituency
1502 #Morpho
1503 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001504
Akronc13a1702016-03-15 19:33:14 +01001505
1506More importers are in preparation.
1507New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1508See the built-in annotation importers as examples.
1509
Akronf73ffb62018-06-27 12:13:59 +02001510
Akron41e6c8b2021-10-14 20:22:18 +02001511=head1 METADATA SUPPORT
1512
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1514developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1515
1516=over 2
1517
1518=item I5 - Meta data for all I5 files
1519
1520=item Sgbr - Meta data from the Schreibgebrauch project
1521
1522=item Gingko - Meta data from the Gingko project in addition to I5
1523
1524=back
1525
1526More importers are in preparation.
1527New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1528See the built-in meta data importers as examples.
1529
1530
Akron8f69d632020-01-15 16:58:11 +01001531=head1 About KorAP-XML
1532
1533KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1534data model (Bański et al. 2013), where text data are stored physically
1535separated from their interpretations (i.e. annotations).
1536A text document in KorAP-XML therefore consists of several files
1537containing primary data, metadata and annotations.
1538
1539The structure of a single KorAP-XML document can be as follows:
1540
1541 - data.xml
1542 - header.xml
1543 + base
1544 - tokens.xml
1545 - ...
1546 + struct
1547 - structure.xml
1548 - ...
1549 + corenlp
1550 - morpho.xml
1551 - constituency.xml
1552 - ...
1553 + tree_tagger
1554 - morpho.xml
1555 - ...
1556 - ...
1557
1558The C<data.xml> contains the primary data, the C<header.xml> contains
1559the metadata, and the annotation layers are stored in subfolders
1560like C<base>, C<struct> or C<corenlp>
1561(so-called "foundries"; Bański et al. 2013).
1562
1563Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001564(Lüngen and Sperberg-McQueen 2012). See the documentation in
1565L<KorAP::XML::Meta::I5> for translatable fields.
1566
1567Annotations correspond to a variant of the TEI-P5 feature structures
1568(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001569Annotation feature structures refer to character sequences of the primary text
1570inside the C<text> element of the C<data.xml>.
1571A single annotation containing the lemma of a token can have the following structure:
1572
1573 <span from="0" to="3">
1574 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1575 <f name="lex">
1576 <fs>
1577 <f name="lemma">zum</f>
1578 </fs>
1579 </f>
1580 </fs>
1581 </span>
1582
The C<from> and C<to> attributes refer to the character span
1584in the primary text.
1585Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1586the structure may vary. See L<KorAP::XML::Annotation::*> for various
1587annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001588
1589Multiple KorAP-XML documents are organized on three levels following
1590the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1591corpus E<gt> document E<gt> text. On each level metadata information
1592can be stored, that C<korapxml2krill> will merge to a single metadata
1593object per text. A corpus is therefore structured as follows:
1594
1595 + <corpus>
1596 - header.xml
1597 + <document>
1598 - header.xml
1599 + <text>
1600 - data.xml
1601 - header.xml
1602 - ...
1603 - ...
1604
1605A single text can be identified by the concatenation of
1606the corpus identifier, the document identifier and the text identifier.
1607This identifier is called the text sigle
1608(e.g. a text with the identifier C<18486> in the document C<060> in the
1609corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1610
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1613(e.g. one zip file per foundry), which is also supported (see C<--input>).
1614
1615Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1616in form of a test suite.
1617The resulting JSON format merges all annotation layers
1618based on a single token stream.
1619
1620=head2 References
1621
1622Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1623KorAP data model: first approximation, December.
1624
1625Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1626"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1627Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1628L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1629
1630Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1631"Robust corpus architecture: a new look at virtual collections and data access",
1632Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1633L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1634
1635Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1636Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1637"Towards an international standard on feature structure representation",
1638Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1639pp. 373-376.
1640L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1641
1642Harald Lüngen and C. M. Sperberg-McQueen (2012):
1643"A TEI P5 Document Grammar for the IDS Text Model",
1644Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1645L<PDF|https://journals.openedition.org/jtei/pdf/508>
1646
1647TEI Consortium, eds:
1648"Feature Structures",
1649Guidelines for Electronic Text Encoding and Interchange.
1650L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1651
Akron941c1a62016-02-23 17:41:41 +01001652=head1 AVAILABILITY
1653
1654 https://github.com/KorAP/KorAP-XML-Krill
1655
1656
1657=head1 COPYRIGHT AND LICENSE
1658
Akron9a2545e2022-01-16 15:15:50 +01001659Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001660
Akron6882d7d2021-02-08 09:43:57 +01001661Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001662
Akrona76d8352016-10-27 16:27:32 +02001663Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001664
Akron6882d7d2021-02-08 09:43:57 +01001665L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001666Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001667L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001668member of the
Akronf1849aa2019-12-16 23:35:33 +01001669L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001670
1671This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001672L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001673
1674=cut