blob: 75dd21d53970d3cc9915b1ae3f2dff7b00e87efe [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron83aedd32023-02-07 10:57:41 +0100167#
168# 2023/02/05
169# - Support for UD
Akron941c1a62016-02-23 17:41:41 +0100170# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100171
# Script metadata: date of the last change, location of the script
# and the Koral version used, if none is passed by the user
our $LAST_CHANGE   = '2023/02/05';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# Version banner, printed as the message of the usage output
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
178
# Parse command: A leading argument is treated as the command,
# unless it starts with a dash (and is therefore an option)
my $cmd;
our @ARGV;
if ($ARGV[0]) {
  $cmd = shift @ARGV unless index($ARGV[0], '-') == 0;
};

# Remember the arguments following the command, so they
# can be passed on to subprocesses
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100186
# Values of options that can be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Path to an optional configuration file
my $cfg_file;

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'gzip|z'                   => \$cfg{gzip},
  'to-tar'                   => \$cfg{to_tar},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'sequential-extraction|se' => \$cfg{sequential_extraction},

  # Conversion
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'koral|k=f'                => \$cfg{koral},
  'lang=s'                   => \$cfg{lang},

  # Filters
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,

  # Caching
  'cache|c=s'                => \$cfg{cache_file},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Runtime
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'jobs|j=i'                 => \$cfg{jobs},

  # Flags that are still accepted, but only trigger a warning
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Print the documentation
  'help|h' => sub {
    pod2usage(
      -msg      => $VERSION_MSG,
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -msg     => $VERSION_MSG,
      -verbose => 0,
      -output  => '-'
    )
  }
);

# Parameters for the usage output in case of invalid arguments,
# terminating the script with exit code 1
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200251
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {
  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Scalar values: Only taken from the configuration file,
  # if not already defined (e.g. by the command line).
  # Dashes in the keys are replaced by underscores.
  my @scalar_keys = qw!output cache-size input-base token overwrite
                       meta base-sentences base-paragraphs base-pagebreaks
                       gzip to-tar log lang cache non-word-tokens
                       non-verbal-tokens sequential-extraction
                       temporary-extract cache-init
                       koral extract-dir jobs!;

  foreach my $key (@scalar_keys) {
    next unless defined $config{$key};

    (my $underlined = $key) =~ tr/-/_/;
    $cfg{$underlined} //= $config{$key};
  };

  # List values (Skip, Sigle, Anno): Separated by semicolons and
  # only taken from the configuration file, if the list is still empty
  foreach my $list (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $values) = @$list;

    next if scalar(@$values) || !defined $config{$key};

    @$values = split /\s*;\s*/, $config{$key};
  };
};
287
# Init variables and set default values
my $output     = $cfg{output};
my $input_base = $cfg{input_base};
my $gzip       = $cfg{gzip};
my $to_tar     = $cfg{to_tar};

# The command line option 'temporary-extract' and the configuration
# key 'extract-dir' are both stored as 'extract_dir', while the
# configuration key 'temporary-extract' is stored as 'temporary_extract'.
# Respect all of them, so the configuration key is no longer ignored.
my $extract_dir = $cfg{extract_dir} // $cfg{temporary_extract};

my $token_base = $cfg{token} // 'OpenNLP#tokens';

# The command line option 'cache' is stored as 'cache_file', while the
# configuration key 'cache' is stored as 'cache'. Prefer the command
# line value, so it is no longer ignored.
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs         = $cfg{jobs}         // 0;
my $cache_delete = $cfg{cache_delete} // 1;

# The base definitions are compared in lower case
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');

my $sequential_extraction = $cfg{sequential_extraction} // 0;
Akron63f20d42017-04-10 23:40:29 +0200302
# Get tokenization basis in the form of 'Foundry#Layer'.
# The variables are declared unconditionally, as the behaviour
# of 'my' combined with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension - the layer is undefined,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akron3741f8b2016-12-21 19:55:21 +0100308
# Convert sigle to path construct, e.g. 'A_B.C' to 'A/B/C'
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Index of everything to skip (in lower case)
my %skip;
foreach my $skipped (@skip) {
  $skip{lc($skipped)} = 1;
};

# Log to STDERR, by default on error level
my $log_level = uc($cfg{log} // 'ERROR');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron63f20d42017-04-10 23:40:29 +0200318
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first argument not consumed as an option
  my $log_file = shift @ARGV;

  # Fail on a missing argument as well as on a missing file,
  # without testing an undefined value for existence
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  my $slimmer = KorAP::XML::Log::Slim->new($log_file);
  $slimmer->slim_to;

  exit;
};
340
341
# With a command, a given output has to be an existing directory,
# unless the to-tar option is set
if ($cmd && $output && !defined($to_tar)) {
  unless (-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000352
# Start serial processing: Every input is processed by a separate
# 'archive' call of this script, writing below the output directory
if ($cmd && $cmd eq 'serial') {

  # Output is required, as all archive outputs are created below it
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments to pass on.
  # Getopt::Long accepts the names with one or two dashes and the
  # value either as the next argument or attached with '='.
  my $io_flag     = qr/\A--?(?:input|i|output|o)/;
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 0;

    # Input or output flag with a separate value
    if (/${io_flag}\z/) {
      $remove_next = 1;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter, unless it is an input or
    # output flag with an attached value
    elsif (!/${io_flag}=/) {
      $keep = 1;
    };

    $keep;
  } @keep_argv;

  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $archive_input, '-o', $new_out
    );
    print "Start serial processing of $archive_input to $new_out\n";

    # Start archiving and report failures instead of ignoring them.
    # The remaining inputs are processed nonetheless.
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $archive_input failed (" .
          ($? == -1 ? "unable to start: $!" : 'exit status ' . ($? >> 8)) .
          ')'
      );
    };
  };

  exit;
};
402
# Collect the base definitions that rely on the DeReKo structure
my @dereko_attr = ();
foreach my $base (
  ['sentences',  $base_sentences],
  ['paragraphs', $base_paragraphs],
  ['pagebreaks', $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# Define supported (and preinstalled) transformation modules
# as [Foundry, Layer] pairs, optionally followed by a third
# element naming the bases - the order is preserved
my @layers = (

  # Base - only if no other base is defined
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - with the collected bases, if any are given
  ($dereko_attr[0]
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure']),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence']
     : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # UDPipe
  ['UDPipe', 'Morpho'],
  ['UDPipe', 'Dependency'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akrone1dbc382016-07-08 22:24:52 +0200529
Akron4fa37c32017-01-20 14:43:10 +0100530
# Check filters
my @filtered_anno;

# Everything is skipped - only add the annotations passed
# explicitly, each split at '#'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};
549
Akrone1dbc382016-07-08 22:24:52 +0200550
551# TODO: This should not be initialized for batch
552my $cache = Cache::FastMmap->new(
553 share_file => $cache_file,
Akronf8df2162020-08-07 15:03:39 +0200554 cache_size => ($cfg{cache_size} // '50m'),
555 init_file => ($cfg{cache_init} // 1)
Akrone1dbc382016-07-08 22:24:52 +0200556);
557
Akron03b24db2016-08-16 20:54:32 +0200558# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200559my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200560 cache => $cache,
Akronf8df2162020-08-07 15:03:39 +0200561 meta_type => $cfg{meta},
562 overwrite => $cfg{overwrite},
Akron03b24db2016-08-16 20:54:32 +0200563 foundry => $token_base_foundry,
564 layer => $token_base_layer,
565 gzip => $gzip,
566 log => $log,
Akronf8df2162020-08-07 15:03:39 +0200567 koral => ($cfg{koral} // $KORAL_VERSION),
Akroned9baf02019-01-22 17:03:25 +0100568 anno => \@filtered_anno,
Akronf8df2162020-08-07 15:03:39 +0200569 non_word_tokens => ($cfg{non_word_tokens} // 0),
Akron64f7fae2022-07-27 12:45:33 +0200570 non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
571 lang => $cfg{lang},
Akrone1dbc382016-07-08 22:24:52 +0200572);
573
# Auto adjust jobs: A value of -1 requests five jobs per core
if ($jobs eq '-1') {

  # Sys::Info is loaded on demand only
  my $sys_info_loaded =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  # Assume a single core, if the number can't be determined
  my $cores = 1;
  unless ($sys_info_loaded) {
    $log->warn("Unable to determine number of cores");
  }
  else {
    $cores = Sys::Info->new->device('CPU')->count;
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
587
588
# Glob and prefix files
if (@input > 0) {

  # Prefix all inputs with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
608
609
# Process a single file
unless ($cmd) {

  # Take the start times at compile time already
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since
  # the start and remember the current stop
  sub stop_time {
    my $now         = Benchmark->new;
    my $since_last  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));

    $log->info("The code took: $since_last (overall: $since_start)");

    $main::LAST_STOP = $now;
  };

  # Only the first input is respected
  my $input = $input[0];

  # Ensure the input path ends with a slash
  $input =~ s{(?<=[^/])$}{/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
641
Nils Diewald59094f22014-11-05 18:20:50 +0000642
Akrone10ad322016-02-27 10:54:26 +0100643# Extract XML files
Akron81500102017-04-07 20:45:44 +0200644if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100645
Akrond5643ad2017-07-04 20:27:13 +0200646 # Output is required
647 pod2usage(%ERROR_HASH) unless $output;
648
Akron7d4cdd82016-08-17 21:39:45 +0200649 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200650 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100651
Akron7d4cdd82016-08-17 21:39:45 +0200652 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100653 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200654 $log->error("Unzip is not installed or incompatible.");
655 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100656 };
657
Akronb0c88db2016-06-29 16:33:18 +0200658 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200659 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200660
Akron31a08cb2019-02-20 20:43:26 +0100661 # Will set @sigle
662 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200663
Akrone10ad322016-02-27 10:54:26 +0100664 # Iterate over all given sigles and extract
665 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100666
Akron2812ba22016-10-28 21:55:59 +0200667 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200668
Akron03b24db2016-08-16 20:54:32 +0200669 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200670 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100671
Akron955b75b2019-02-21 14:28:41 +0100672 # TODO:
673 # - prefix???
674 $archive->extract_sigle([$_], $output, $jobs)
675 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200676 );
Akrone10ad322016-02-27 10:54:26 +0100677 print "extracted.\n";
678 };
Akronb0c88db2016-06-29 16:33:18 +0200679 }
Akron7d4cdd82016-08-17 21:39:45 +0200680
681 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200682 else {
683 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200684 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100685 };
686}
687
Akron81500102017-04-07 20:45:44 +0200688
Akron941c1a62016-02-23 17:41:41 +0100689# Process an archive
690elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000691
Akron81500102017-04-07 20:45:44 +0200692 my $archive_output;
693
694 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100695 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200696
697 # Create new archive object
698 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
699
700 # Check zip capabilities
701 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200702 $log->error("Unzip is not installed or incompatible.");
703 exit 1;
Akron81500102017-04-07 20:45:44 +0200704 };
705
706 # Add further annotation archived
707 $archive->attach($_) foreach @input[1..$#input];
708
709 # Create a temporary directory
710 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200711 $extract_dir = tempdir(CLEANUP => 0);
712 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200713 };
714
Akron63f20d42017-04-10 23:40:29 +0200715 # Add some random extra to avoid clashes with multiple archives
716 $extract_dir = catdir($extract_dir, random_string('cccccc'));
717
Akron31a08cb2019-02-20 20:43:26 +0100718 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200719 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200720 @input = ($extract_dir);
721 }
722 else {
723 $log->error('Unable to extract from primary archive ' . $input[0] .
724 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200725 exit 1;
Akron81500102017-04-07 20:45:44 +0200726 };
727 }
728
729 # Can't create archive object
730 else {
731 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200732 exit 1;
Akron81500102017-04-07 20:45:44 +0200733 };
734 };
735
Akron7d4cdd82016-08-17 21:39:45 +0200736 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100737 my $pool = Parallel::ForkManager->new($jobs);
738
Akron7d4cdd82016-08-17 21:39:45 +0200739 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100740 my $iter = 1; # Current text in process
741
Akronda3097e2017-04-23 19:53:57 +0200742 my $tar_archive;
743 my $output_dir = $output;
744 my $tar_fh;
745
746 # Initialize tar archive
747 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200748
749 # Set output name
750 my $tar_file = $output;
751 unless ($tar_file =~ /\.tar$/) {
752 $tar_file .= '.tar';
753 };
754
755 # Initiate the tar file
756 print "Writing to file $tar_file\n";
757 $tar_fh = IO::File->new($tar_file, 'w');
758 $tar_fh->binmode(1);
759
Akroneb370a02022-02-24 13:33:40 +0100760 # Use tar builder for archiving
761 if (eval("use Archive::Tar::Builder; 1;")) {
762 $tar_archive = Archive::Tar::Builder->new(
763 ignore_errors => 1
764 );
765
766 # Set handle
767 $tar_archive->set_handle($tar_fh);
768 }
769
770 # Fallback solution
771 else {
772 $tar_archive = KorAP::XML::TarBuilder->new(
773 $tar_fh
774 );
775 };
Akronda3097e2017-04-23 19:53:57 +0200776
777 # Output to temporary directory
778 $output_dir = File::Temp->newdir;
779 };
780
Akron941c1a62016-02-23 17:41:41 +0100781 # Report on fork message
782 $pool->run_on_finish (
783 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200784 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100785 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200786
Akron08385f62016-03-22 20:37:04 +0100787 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200788 ($iter++) . "/$count]" .
789 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200790 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200791
792 if (!$code && $to_tar && $data->[2]) {
793 my $filename = $data->[2];
794
795 # Lock filehandle
796 if (flock($tar_fh, LOCK_EX)) {
797
Akron9a062ce2017-07-04 19:12:05 +0200798 my $clean_file = fileparse($filename);
799
Akronda3097e2017-04-23 19:53:57 +0200800 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200801 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200802 unlink $filename;
803
804 # Unlock filehandle
805 flock($tar_fh, LOCK_UN);
806 }
807 else {
808 $log->warn("Unable to add $filename to archive");
809 };
810 };
811
Akron4c0cf312016-10-15 16:42:09 +0200812 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100813 }
814 );
815
816 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200817 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100818 print "Reading data ...\n";
819
Akron7d4cdd82016-08-17 21:39:45 +0200820 # unless (Cache::FastMmap->new(
821 # share_file => $cache_file,
822 # cache_size => $cache_size,
823 # init_file => $cache_init
824 # )) {
825 # print "Unable to intialize cache '$cache_file'\n\n";
826 # exit(1);
827 # };
Akron11c80302016-03-18 19:44:43 +0100828
Akron486f9ab2017-04-22 23:25:19 +0200829
Akron941c1a62016-02-23 17:41:41 +0100830 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100831 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200832 # TODO:
833 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100834 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100835 my @dirs;
836 my $dir;
837
Akron7d4cdd82016-08-17 21:39:45 +0200838 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100839 while (1) {
840 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200841 push @dirs, $dir;
842 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100843 };
844 last unless $it->next;
845 };
846
847 print "Start processing ...\n";
848 $t = Benchmark->new;
849 $count = scalar @dirs;
850
851 DIRECTORY_LOOP:
852 for (my $i = 0; $i < $count; $i++) {
853
Akrone1dbc382016-07-08 22:24:52 +0200854 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200855 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200856 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200857 );
Akron941c1a62016-02-23 17:41:41 +0100858
859 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200860 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200861
Akron13d56622016-10-31 14:54:49 +0100862 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200863 $pool->finish(
864 0,
Akronda3097e2017-04-23 19:53:57 +0200865 [
866 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
867 undef,
868 $filename
869 ]
Akron486f9ab2017-04-22 23:25:19 +0200870 );
Akron3ec48972016-08-17 23:24:52 +0200871 }
872 else {
Akron4c0cf312016-10-15 16:42:09 +0200873 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200874 };
Akron941c1a62016-02-23 17:41:41 +0100875 };
876 }
877
878 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200879 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200880
Akron941c1a62016-02-23 17:41:41 +0100881 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200882 $log->error("Unzip is not installed or incompatible.");
883 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100884 };
885
Akron08385f62016-03-22 20:37:04 +0100886 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200887 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100888
Akron31a08cb2019-02-20 20:43:26 +0100889 # Get sigles to extract
890 my $prefix = set_sigle($archive);
891
Akron941c1a62016-02-23 17:41:41 +0100892 print "Start processing ...\n";
893 $t = Benchmark->new;
894 my @dirs = $archive->list_texts;
895 $count = scalar @dirs;
896
897 ARCHIVE_LOOP:
898 for (my $i = 0; $i < $count; $i++) {
899
900 # Split path information
901 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
902
Akrone1dbc382016-07-08 22:24:52 +0200903 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200904 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200905 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200906 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200907 catfile($corpus, $doc, $text)
908 . '.json' . ($gzip ? '.gz' : '')
909 )
Akrone1dbc382016-07-08 22:24:52 +0200910 );
Akron941c1a62016-02-23 17:41:41 +0100911
912 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200913 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100914
Akron4c0cf312016-10-15 16:42:09 +0200915 # Create temporary file
916 $temp = File::Temp->newdir;
917
Akronbdf434a2016-10-24 17:42:07 +0200918 # TODO: Check if $filename exist at the beginning,
919 # because extraction can be horrible slow!
920
Akron941c1a62016-02-23 17:41:41 +0100921 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100922 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100923
Akron7d4cdd82016-08-17 21:39:45 +0200924 # Create corpus directory
925 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100926
Akron7d4cdd82016-08-17 21:39:45 +0200927 # Temporary directory
928 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100929
Akron7d4cdd82016-08-17 21:39:45 +0200930 # Write file
Akron13d56622016-10-31 14:54:49 +0100931 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200932
Akron4c0cf312016-10-15 16:42:09 +0200933 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100934 $pool->finish(
935 0,
Akronda3097e2017-04-23 19:53:57 +0200936 [
937 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
938 $temp,
939 $filename
940 ]
Akron13d56622016-10-31 14:54:49 +0100941 );
Akron7d4cdd82016-08-17 21:39:45 +0200942 }
943 else {
Akron4c0cf312016-10-15 16:42:09 +0200944 # Delete temporary file
945 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200946 };
Akron941c1a62016-02-23 17:41:41 +0100947 }
Akron7d4cdd82016-08-17 21:39:45 +0200948
949 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100950 else {
Akron4c0cf312016-10-15 16:42:09 +0200951 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100952 };
953 };
954 }
955
956 else {
957 print "Input is neither a directory nor an archive.\n\n";
958 };
959
960 $pool->wait_all_children;
961
Akron11c80302016-03-18 19:44:43 +0100962 # Delete cache file
963 unlink($cache_file) if $cache_delete;
964
Akronda3097e2017-04-23 19:53:57 +0200965 # Close tar filehandle
966 if ($to_tar && $tar_fh) {
967 $tar_archive->finish;
968 $tar_fh->close;
969 print "Wrote to tar archive.\n";
970 };
971
Akron63f20d42017-04-10 23:40:29 +0200972 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100973 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200974};
Akron941c1a62016-02-23 17:41:41 +0100975
Nils Diewald2db9ad02013-10-29 19:26:43 +0000976
# Prepare the file-level sigle list (@sigle) for processing an archive.
#
# In case no sigles were given, every text listed by the archive is
# registered as a text sigle of the form Corpus/Document/Text.
# In case sigles were given, all doc sigles (Corpus/Document) are
# directly passed to extract_sigle() with $output as the target and
# removed from the list, so only text sigles remain in @sigle.
#
# Expects the archive object as the only parameter.
#
# Returns the prefix value of the last split path (no sigles given)
# or the result of check_prefix() (sigles given).
# Defaults to 1 in case the archive lists no texts.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - register all texts of the archive
  unless (@sigle) {

    # Use a named loop variable, so methods called on the archive
    # can't clobber the current path via the global $_
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - extract doc sigles and keep text sigles
  my @text_sigle;
  my $prefix_check = 0;

  # Iterate over all sigle
  foreach my $sigle (@sigle) {

    # Doc sigles consist of two path segments only,
    # optionally introduced by a '.' root folder
    my $is_doc_sigle = $sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$sigle ..." if $is_doc_sigle;

    # Check if a prefix is needed - only once for all sigles
    unless ($prefix_check) {
      if ($prefix = $archive->check_prefix) {
        print " with prefix ...";
      };
      $prefix_check = 1;
    };

    # Sigle is a text sigle - keep for later processing
    unless ($is_doc_sigle) {
      push @text_sigle, $sigle;
      next;
    };

    print "\n";

    # Sigle is a doc sigle - extract the document and report on success
    print '... ' . (
      $archive->extract_sigle([$sigle], $output, $sequential_extraction ? 1 : $jobs)
        ? '' : 'not '
    );
    print "extracted.\n";
  };

  # Only text sigles are left to process
  @sigle = @text_sigle;

  return $prefix;
};
1050
1051
# Remove the temporary extraction directory, in case one was used,
# and log the number of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};


# Finish the output with an empty line
print "\n";
1060
Nils Diewald2db9ad02013-10-29 19:26:43 +00001061__END__
Akron941c1a62016-02-23 17:41:41 +01001062
1063=pod
1064
1065=encoding utf8
1066
1067=head1 NAME
1068
Akron42f48c12020-02-14 13:08:13 +01001069korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001070
1071
1072=head1 SYNOPSIS
1073
Akrona76d8352016-10-27 16:27:32 +02001074 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001075
Akron2fd402b2016-10-27 21:26:48 +02001076
Akron941c1a62016-02-23 17:41:41 +01001077=head1 DESCRIPTION
1078
1079L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1080compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001081The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001082
1083
1084=head1 INSTALLATION
1085
1086The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1087
Akronaf386982016-10-12 00:33:25 +02001088 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001089
Akronc13a1702016-03-15 19:33:14 +01001090In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001091be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001092Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001093Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1094Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001096
1097=head1 ARGUMENTS
1098
Akrona76d8352016-10-27 16:27:32 +02001099 $ korapxml2krill -z --input <directory> --output <filename>
1100
1101Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001102It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001103
Akron941c1a62016-02-23 17:41:41 +01001104=over 2
1105
1106=item B<archive>
1107
Akron081639e2017-04-21 19:01:39 +02001108 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001109
Akron2fd402b2016-10-27 21:26:48 +02001110Converts an archive of KorAP-XML documents. It expects a directory
1111(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001112
1113=item B<extract>
1114
Akrona76d8352016-10-27 16:27:32 +02001115 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1116
1117Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001118
Akron63f20d42017-04-10 23:40:29 +02001119=item B<serial>
1120
1121 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1122
1123Convert archives sequentially. The inputs are not merged but treated
1124as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001126are created based on the archive name. In case the C<--to-tar> flag is given,
1127the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001128
1129
Akron84b53ad2022-01-14 12:39:15 +01001130=item B<slimlog>
1131
1132 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1133
Filters out all uninformative (i.e. successful) entries from logs to simplify
log checks. Expects no further options.
1136
1137
Akron941c1a62016-02-23 17:41:41 +01001138=back
1139
1140
1141=head1 OPTIONS
1142
1143=over 2
1144
Akrona76d8352016-10-27 16:27:32 +02001145=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001146
Akrona76d8352016-10-27 16:27:32 +02001147Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001148
Akron7606afa2016-10-25 16:23:49 +02001149Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001150document, while C<archive> expects a KorAP-XML corpus folder or a zip
1151file to batch process multiple files.
1152C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001153
Akrona76d8352016-10-27 16:27:32 +02001154C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001155that the first archive listed contains all primary data files
1156and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001157
Akron7606afa2016-10-25 16:23:49 +02001158 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001159
Akron821db3d2017-04-06 21:19:31 +02001160Input may also be defined using BSD glob wildcards.
1161
1162 -i 'file/news*.zip'
1163
1164The extended input array will be sorted in length order, so the shortest
1165path needs to contain all primary data files and all meta data files.
1166
Akron0c3e3752016-06-28 15:55:53 +02001167(The directory structure follows the base directory format,
1168that may include a C<.> root folder.
1169In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001170need to be passed with a hash sign in front of the archive's name.
1171This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001172
Akron7606afa2016-10-25 16:23:49 +02001173To support zip files, a version of C<unzip> needs to be installed that is
1174compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001175
Akron7606afa2016-10-25 16:23:49 +02001176B<The root folder switch using the hash sign is experimental and
1177may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001178
Akronf73ffb62018-06-27 12:13:59 +02001179
Akron63f20d42017-04-10 23:40:29 +02001180=item B<--input-base|-ib> <directory>
1181
1182The base directory for inputs.
1183
1184
Akron941c1a62016-02-23 17:41:41 +01001185=item B<--output|-o> <directory|file>
1186
1187Output folder for archive processing or
1188document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001189writes to C<STDOUT> by default
1190(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001191
1192=item B<--overwrite|-w>
1193
1194Overwrite files that already exist.
1195
Akronf73ffb62018-06-27 12:13:59 +02001196
Akron3741f8b2016-12-21 19:55:21 +01001197=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001198
1199Define the default tokenization by specifying
1200the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001201of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001202This will directly take the file instead of running
1203the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001204
Akron3741f8b2016-12-21 19:55:21 +01001205
1206=item B<--base-sentences|-bs> <foundry>#<layer>
1207
1208Define the layer for base sentences.
1209If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001210Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1211layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001212
1213 Defaults to unset.
1214
1215
1216=item B<--base-paragraphs|-bp> <foundry>#<layer>
1217
1218Define the layer for base paragraphs.
1219If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001220Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001222
1223 Defaults to unset.
1224
1225
Akron41ac10b2017-02-08 22:47:25 +01001226=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1227
1228Define the layer for base pagebreaks.
1229Currently C<DeReKo#Structure> is the only layer supported.
1230
1231 Defaults to unset.
1232
1233
Akron941c1a62016-02-23 17:41:41 +01001234=item B<--skip|-s> <foundry>[#<layer>]
1235
Akronf7ad89e2016-03-16 18:22:47 +01001236Skip specific annotations by specifying the foundry
1237(and optionally the layer with a C<#>-prefix),
1238e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001239Can be set multiple times.
1240
Akronf73ffb62018-06-27 12:13:59 +02001241
Akronc13a1702016-03-15 19:33:14 +01001242=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001243
Akronf7ad89e2016-03-16 18:22:47 +01001244Convert specific annotations by specifying the foundry
1245(and optionally the layer with a C<#>-prefix),
1246e.g. C<Mate> or C<Mate#Morpho>.
1247Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001248
Akronf73ffb62018-06-27 12:13:59 +02001249
Akroned9baf02019-01-22 17:03:25 +01001250=item B<--non-word-tokens|-nwt>
1251
1252Tokenize non-word tokens like word tokens (defined as matching
1253C</[\d\w]/>). Useful to treat punctuations as tokens.
1254
1255 Defaults to unset.
1256
Akronf1849aa2019-12-16 23:35:33 +01001257
1258=item B<--non-verbal-tokens|-nvt>
1259
Tokenize non-verbal tokens, marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1262
1263 Defaults to unset.
1264
1265
Akron941c1a62016-02-23 17:41:41 +01001266=item B<--jobs|-j>
1267
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001269for archive processing.
Akron11c80302016-03-18 19:44:43 +01001270Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001271
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1274
Akronc11f7982017-02-21 21:20:14 +01001275Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001276times the number of available cores, in case L<Sys::Info>
1277is available.
Akronf7ad89e2016-03-16 18:22:47 +01001278This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001279
Akronf73ffb62018-06-27 12:13:59 +02001280
Akron263274c2019-02-07 09:48:30 +01001281=item B<--koral|-k>
1282
1283Version of the output format. Supported versions are:
1284C<0> for legacy serialization, C<0.03> for serialization
1285with metadata fields as key-values on the root object,
1286C<0.4> for serialization with metadata fields as a list
1287of C<"@type":"koral:field"> objects.
1288
1289Currently defaults to C<0.03>.
1290
1291
Akron9ec88872017-04-12 16:29:06 +02001292=item B<--sequential-extraction|-se>
1293
Flag to indicate that extraction runs sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1295Some systems may have problems with extracting multiple archives
1296to the same folder at the same time.
1297Can be flagged using C<--no-sequential-extraction> as well.
1298Defaults to C<false>.
1299
Akronf73ffb62018-06-27 12:13:59 +02001300
Akron35db6e32016-03-17 22:42:22 +01001301=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001302
Akron35db6e32016-03-17 22:42:22 +01001303Define the metadata parser to use. Defaults to C<I5>.
1304Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1305This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001306
Akronf73ffb62018-06-27 12:13:59 +02001307
Akron941c1a62016-02-23 17:41:41 +01001308=item B<--gzip|-z>
1309
Akronf7ad89e2016-03-16 18:22:47 +01001310Compress the output.
1311Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001312
Akronf73ffb62018-06-27 12:13:59 +02001313
Akron11c80302016-03-18 19:44:43 +01001314=item B<--cache|-c>
1315
1316File to mmap a cache (using L<Cache::FastMmap>).
1317Defaults to C<korapxml2krill.cache> in the calling directory.
1318
Akronf73ffb62018-06-27 12:13:59 +02001319
Akron11c80302016-03-18 19:44:43 +01001320=item B<--cache-size|-cs>
1321
1322Size of the cache. Defaults to C<50m>.
1323
Akronf73ffb62018-06-27 12:13:59 +02001324
Akron11c80302016-03-18 19:44:43 +01001325=item B<--cache-init|-ci>
1326
1327Initialize cache file.
1328Can be flagged using C<--no-cache-init> as well.
1329Defaults to C<true>.
1330
Akronf73ffb62018-06-27 12:13:59 +02001331
Akron11c80302016-03-18 19:44:43 +01001332=item B<--cache-delete|-cd>
1333
1334Delete cache file after processing.
1335Can be flagged using C<--no-cache-delete> as well.
1336Defaults to C<true>.
1337
Akronf73ffb62018-06-27 12:13:59 +02001338
Akron636aa112017-04-07 18:48:56 +02001339=item B<--config|-cfg>
1340
Configure the parameters of your call in a file
of key-value pairs with whitespace separator:
1343
1344 overwrite 1
1345 token DeReKo#Structure
1346 ...
1347
1348Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001349C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001350C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001351C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001352C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001353C<base-sentences>, C<base-paragraphs>,
1354C<base-pagebreaks>,
1355C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001356(semicolon separated), C<anno> (semicolon separated).
1357
Akronf73ffb62018-06-27 12:13:59 +02001358Configuration parameters will always be overwritten by
1359passed parameters.
1360
1361
Akron81500102017-04-07 20:45:44 +02001362=item B<--temporary-extract|-te>
1363
1364Only valid for the C<archive> command.
1365
1366This will first extract all files into a
1367directory and then will archive.
1368If the directory is given as C<:temp:>,
1369a temporary directory is used.
1370This is especially useful to avoid
1371massive unzipping and potential
1372network latency.
Akron636aa112017-04-07 18:48:56 +02001373
Akronf73ffb62018-06-27 12:13:59 +02001374
Akronc93a0802019-07-11 15:48:34 +02001375=item B<--to-tar>
1376
1377Only valid for the C<archive> command.
1378
1379Writes the output into a tar archive.
1380
1381
Akrone10ad322016-02-27 10:54:26 +01001382=item B<--sigle|-sg>
1383
Akron20807582016-10-26 17:11:34 +02001384Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001385Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001386I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001387Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001388In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001389On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001390
Akron64f7fae2022-07-27 12:45:33 +02001391=item B<--lang>
1392
1393Preferred language for metadata fields. In case multiple titles are
1394given (on any level) with different C<xml:lang> attributes,
1395the language given is preferred.
1396Because titles may have different sources and different priorities,
1397non-specific language titles may still be preferred in case the title
1398source has a higher priority.
1399
Akronf73ffb62018-06-27 12:13:59 +02001400
Akron941c1a62016-02-23 17:41:41 +01001401=item B<--log|-l>
1402
Akronb9c33812020-10-21 16:19:35 +02001403The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001404
Akronf73ffb62018-06-27 12:13:59 +02001405
Akron941c1a62016-02-23 17:41:41 +01001406=item B<--help|-h>
1407
Akron42f48c12020-02-14 13:08:13 +01001408Print help information.
Akron941c1a62016-02-23 17:41:41 +01001409
Akronf73ffb62018-06-27 12:13:59 +02001410
Akron941c1a62016-02-23 17:41:41 +01001411=item B<--version|-v>
1412
1413Print version information.
1414
1415=back
1416
Akronf73ffb62018-06-27 12:13:59 +02001417
Akronc13a1702016-03-15 19:33:14 +01001418=head1 ANNOTATION SUPPORT
1419
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1424
Akron821db3d2017-04-06 21:19:31 +02001425 Base
1426 #Paragraphs
1427 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001428
Akron821db3d2017-04-06 21:19:31 +02001429 Connexor
1430 #Morpho
1431 #Phrase
1432 #Sentences
1433 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001434
Akron821db3d2017-04-06 21:19:31 +02001435 CoreNLP
1436 #Constituency
1437 #Morpho
1438 #NamedEntities
1439 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001440
Akronce125b62017-06-19 11:54:36 +02001441 CMC
1442 #Morpho
1443
Akron821db3d2017-04-06 21:19:31 +02001444 DeReKo
1445 #Structure
Akronc13a1702016-03-15 19:33:14 +01001446
Akron57510c12019-01-04 14:58:53 +01001447 DGD
1448 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001449 #Structure
Akron57510c12019-01-04 14:58:53 +01001450
Akron821db3d2017-04-06 21:19:31 +02001451 DRuKoLa
1452 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001453
Akron821db3d2017-04-06 21:19:31 +02001454 Glemm
1455 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001456
Akronabb36902021-10-11 15:51:06 +02001457 Gingko
1458 #Morpho
1459
Akronea1aed52018-07-19 14:43:34 +02001460 HNC
1461 #Morpho
1462
Akron4c679192018-01-16 17:41:49 +01001463 LWC
1464 #Dependency
1465
Akron821db3d2017-04-06 21:19:31 +02001466 Malt
1467 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001468
Akron821db3d2017-04-06 21:19:31 +02001469 MarMoT
1470 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001471
Akron821db3d2017-04-06 21:19:31 +02001472 Mate
1473 #Dependency
1474 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001475
Akron821db3d2017-04-06 21:19:31 +02001476 MDParser
1477 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001478
Akrone85a7762022-07-22 08:05:03 +02001479 NKJP
1480 #Morpho
1481 #NamedEntities
1482
Akron821db3d2017-04-06 21:19:31 +02001483 OpenNLP
1484 #Morpho
1485 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001486
Akron07e24772020-04-23 14:00:54 +02001487 RWK
1488 #Morpho
1489 #Structure
1490
Akron821db3d2017-04-06 21:19:31 +02001491 Sgbr
1492 #Lemma
1493 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001494
Akron7d5e6382019-08-08 16:36:27 +02001495 Talismane
1496 #Dependency
1497 #Morpho
1498
Akron821db3d2017-04-06 21:19:31 +02001499 TreeTagger
1500 #Morpho
1501 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001502
Akron83aedd32023-02-07 10:57:41 +01001503 UDPipe
1504 #Dependency
1505 #Morpho
1506
Akron821db3d2017-04-06 21:19:31 +02001507 XIP
1508 #Constituency
1509 #Morpho
1510 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001511
Akronc13a1702016-03-15 19:33:14 +01001512
1513More importers are in preparation.
1514New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1515See the built-in annotation importers as examples.
1516
Akronf73ffb62018-06-27 12:13:59 +02001517
Akron41e6c8b2021-10-14 20:22:18 +02001518=head1 METADATA SUPPORT
1519
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1521developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1522
1523=over 2
1524
1525=item I5 - Meta data for all I5 files
1526
1527=item Sgbr - Meta data from the Schreibgebrauch project
1528
1529=item Gingko - Meta data from the Gingko project in addition to I5
1530
1531=back
1532
1533More importers are in preparation.
1534New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1535See the built-in meta data importers as examples.
1536
1537
Akron8f69d632020-01-15 16:58:11 +01001538=head1 About KorAP-XML
1539
1540KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1541data model (Bański et al. 2013), where text data are stored physically
1542separated from their interpretations (i.e. annotations).
1543A text document in KorAP-XML therefore consists of several files
1544containing primary data, metadata and annotations.
1545
1546The structure of a single KorAP-XML document can be as follows:
1547
1548 - data.xml
1549 - header.xml
1550 + base
1551 - tokens.xml
1552 - ...
1553 + struct
1554 - structure.xml
1555 - ...
1556 + corenlp
1557 - morpho.xml
1558 - constituency.xml
1559 - ...
1560 + tree_tagger
1561 - morpho.xml
1562 - ...
1563 - ...
1564
1565The C<data.xml> contains the primary data, the C<header.xml> contains
1566the metadata, and the annotation layers are stored in subfolders
1567like C<base>, C<struct> or C<corenlp>
1568(so-called "foundries"; Bański et al. 2013).
1569
1570Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001571(Lüngen and Sperberg-McQueen 2012). See the documentation in
1572L<KorAP::XML::Meta::I5> for translatable fields.
1573
1574Annotations correspond to a variant of the TEI-P5 feature structures
1575(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001576Annotation feature structures refer to character sequences of the primary text
1577inside the C<text> element of the C<data.xml>.
1578A single annotation containing the lemma of a token can have the following structure:
1579
1580 <span from="0" to="3">
1581 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1582 <f name="lex">
1583 <fs>
1584 <f name="lemma">zum</f>
1585 </fs>
1586 </f>
1587 </fs>
1588 </span>
1589
The C<from> and C<to> attributes refer to the character span
1591in the primary text.
1592Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1593the structure may vary. See L<KorAP::XML::Annotation::*> for various
1594annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001595
1596Multiple KorAP-XML documents are organized on three levels following
1597the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1598corpus E<gt> document E<gt> text. On each level metadata information
1599can be stored, that C<korapxml2krill> will merge to a single metadata
1600object per text. A corpus is therefore structured as follows:
1601
1602 + <corpus>
1603 - header.xml
1604 + <document>
1605 - header.xml
1606 + <text>
1607 - data.xml
1608 - header.xml
1609 - ...
1610 - ...
1611
1612A single text can be identified by the concatenation of
1613the corpus identifier, the document identifier and the text identifier.
1614This identifier is called the text sigle
1615(e.g. a text with the identifier C<18486> in the document C<060> in the
1616corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1617
These corpora are often stored in zip files, which C<korapxml2krill>
can process directly. Corpora may also be split into multiple zip archives
1620(e.g. one zip file per foundry), which is also supported (see C<--input>).
1621
1622Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1623in form of a test suite.
1624The resulting JSON format merges all annotation layers
1625based on a single token stream.
1626
1627=head2 References
1628
1629Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1630KorAP data model: first approximation, December.
1631
1632Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1633"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1634Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1635L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1636
1637Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1638"Robust corpus architecture: a new look at virtual collections and data access",
1639Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1640L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1641
1642Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1643Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1644"Towards an international standard on feature structure representation",
1645Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1646pp. 373-376.
1647L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1648
1649Harald Lüngen and C. M. Sperberg-McQueen (2012):
1650"A TEI P5 Document Grammar for the IDS Text Model",
1651Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1652L<PDF|https://journals.openedition.org/jtei/pdf/508>
1653
1654TEI Consortium, eds:
1655"Feature Structures",
1656Guidelines for Electronic Text Encoding and Interchange.
1657L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1658
Akron941c1a62016-02-23 17:41:41 +01001659=head1 AVAILABILITY
1660
1661 https://github.com/KorAP/KorAP-XML-Krill
1662
1663
1664=head1 COPYRIGHT AND LICENSE
1665
Akron83aedd32023-02-07 10:57:41 +01001666Copyright (C) 2015-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001667
Akron6882d7d2021-02-08 09:43:57 +01001668Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001669
Akrona76d8352016-10-27 16:27:32 +02001670Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001671
Akron6882d7d2021-02-08 09:43:57 +01001672L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001673Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001674L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001675member of the
Akronf1849aa2019-12-16 23:35:33 +01001676L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001677
1678This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001679L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001680
1681=cut