blob: 7e1964451b510509db76b15544d6b0907c0ec40a [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron64f7fae2022-07-27 12:45:33 +0200164#
165# 2022/07/27
166# - Support for preferred language transformation
Akron941c1a62016-02-23 17:41:41 +0100167# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100168
# Date of the latest entry in the CHANGES list
our $LAST_CHANGE = '2022/07/27';

# Directory the script resides in
our $LOCAL = $FindBin::Bin;

# Default Koral version
our $KORAL_VERSION = 0.03;

# Version line used as message in all usage outputs
our $VERSION_MSG = join(
  ' - ',
  'Version ' . $KorAP::XML::Krill::VERSION,
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
175
# Parse command: A leading argument that does not start
# with a dash is taken as the command and removed from @ARGV
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Remember all remaining arguments
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100183
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Path to the configuration file
my $cfg_file;

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'gzip|z'                   => \$cfg{gzip},
  'to-tar'                   => \$cfg{to_tar},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'sequential-extraction|se' => \$cfg{sequential_extraction},

  # Conversion
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'koral|k=f'                => \$cfg{koral},
  'lang=s'                   => \$cfg{lang},

  # Selection
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,

  # Caching
  'cache|c=s'                => \$cfg{cache_file},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Runtime
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'jobs|j=i'                 => \$cfg{jobs},

  # Flags that are accepted but have no effect anymore
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Print the full usage information
  'help|h' => sub {
    pod2usage(
      -output   => '-',
      -msg      => $VERSION_MSG,
      -verbose  => 99,
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -output  => '-',
      -msg     => $VERSION_MSG,
      -verbose => 0
    )
  }
);
240
# Parameters for the usage output on erroneous calls
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single value options - values already defined
  # (e.g. by the command line) take precedence
  my @single_keys = qw!output cache-size input-base token overwrite
                       meta base-sentences base-paragraphs base-pagebreaks
                       gzip to-tar log lang cache non-word-tokens
                       non-verbal-tokens sequential-extraction
                       temporary-extract cache-init
                       koral extract-dir jobs!;

  foreach my $key (@single_keys) {
    next unless defined $config{$key};

    # Configuration keys use dashes, the keys of %cfg use underscores
    (my $underlined = $key) =~ tr/-/_/;
    $cfg{$underlined} //= $config{$key};
  };

  # Multi value options (skip, sigle, anno) are separated by
  # semicolons and only respected if not given on the command line
  foreach my $multi (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $values) = @$multi;
    next if scalar(@$values) || !defined $config{$key};
    @$values = split /\s*;\s*/, $config{$key};
  };
};
284
# Init variables and set default values
my $output     = $cfg{output};
my $input_base = $cfg{input_base};
my $gzip       = $cfg{gzip};
my $to_tar     = $cfg{to_tar};

# The command line option and the 'extract-dir' configuration key are
# stored in 'extract_dir', while the 'temporary-extract' configuration
# key is stored in 'temporary_extract' - respect both
my $extract_dir = $cfg{extract_dir} // $cfg{temporary_extract};

my $token_base = $cfg{token} // 'OpenNLP#tokens';

# The command line option is stored in 'cache_file', while the
# 'cache' configuration key is stored in 'cache' - respect both,
# with the command line taking precedence
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs         = $cfg{jobs}         // 0;
my $cache_delete = $cfg{cache_delete} // 1;

# Base annotations are compared case insensitively
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');

my $sequential_extraction = $cfg{sequential_extraction} // 0;
Akron63f20d42017-04-10 23:40:29 +0200299
# Get tokenization basis in the form of 'Foundry#Layer'.
# Declaration and conditional assignment are separated,
# as the behaviour of 'my ... if ...' is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension
# (the layer is not defined, if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of all lowercased skip values
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200311
# Log to STDERR - the log level defaults to ERROR
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is expected as the next argument
  my $log_file = shift @ARGV;

  # The argument may be missing, so check for definedness
  # before testing the file to avoid an uninitialized warning
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  my $slimmer = KorAP::XML::Log::Slim->new($log_file);

  # Run log filter
  $slimmer->slim_to;

  exit;
};
337
338
# With a command given, the output has to be an
# existing directory, unless to-tar is defined
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000349
# Start serial processing:
# Each input is processed by a separate call of the archive command
if ($cmd && $cmd eq 'serial') {

  # Remove all inputs and outputs (flags and their values) from the
  # remembered arguments, as they are set separately for each call
  my @pass_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    };

    # Input or output value
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @pass_argv, $arg;
  };
  @keep_argv = @pass_argv;

  # Number of archive calls that failed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $archive_input, '-o', $new_out
    );
    print "Start serial processing of $archive_input to $new_out\n";

    # Start archiving (list form, so no shell is involved)
    my $status = system(@archive_cmd);

    # Report failing calls instead of silently ignoring them,
    # but continue with the remaining inputs
    if ($status != 0) {
      my $reason =
        $status == -1  ? "unable to start: $!" :
        $status & 127  ? 'terminated by signal ' . ($status & 127) :
                         'exit status ' . ($status >> 8);
      $log->error("Serial processing of $archive_input failed ($reason)");
      $failed++;
    };
  };

  # Signal failure to the caller, if any archive call failed
  exit($failed ? 1 : 0);
};
399
# Base annotations that are taken from the DeReKo structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# Define supported (and preinstalled) transformation modules.
# Each entry consists of the foundry, the layer and
# an optional further parameter.
my @layers = (

  # Base - only if no other base annotation is given
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence'] : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],
  ['NKJP', 'NamedEntities'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akrone1dbc382016-07-08 22:24:52 +0200521
Akron4fa37c32017-01-20 14:43:10 +0100522
# Check filters
my @filtered_anno;

# All supported layers are skipped -
# only use the annotations given as 'Foundry#Layer'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Skip if Foundry or Foundry#Layer should be skipped
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
541
Akrone1dbc382016-07-08 22:24:52 +0200542
# Create the cache shared via the cache file
# TODO: This should not be initialized for batch
my $cache = do {
  my @cache_param = (
    share_file => $cache_file,
    cache_size => ($cfg{cache_size} // '50m'),
    init_file  => ($cfg{cache_init} // 1)
  );
  Cache::FastMmap->new(@cache_param);
};

# Create batch object
my $batch_file = do {
  my @batch_param = (
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    gzip              => $gzip,
    log               => $log,
    koral             => ($cfg{koral} // $KORAL_VERSION),
    anno              => \@filtered_anno,
    non_word_tokens   => ($cfg{non_word_tokens} // 0),
    non_verbal_tokens => ($cfg{non_verbal_tokens} // 0),
    lang              => $cfg{lang},
  );
  KorAP::XML::Batch::File->new(@batch_param);
};
565
# Auto adjust jobs:
# A value of -1 sets the number of jobs to five times the number of cores
if ($jobs eq '-1') {

  # Sys::Info is optional and therefore loaded at runtime
  my $sys_info_loaded =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  # Assume a single core, if the number can't be determined
  my $cores = 1;
  if ($sys_info_loaded) {
    $cores = Sys::Info->new->device('CPU')->count;
  }
  else {
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
579
580
# Glob and prefix files
if (@input > 0) {

  # Prefix all inputs with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  # Wildcards without any matching file result in an empty list -
  # stop here, as all following steps rely on at least one input
  unless (@input) {
    $log->error('No input found');
    exit 1;
  };

  print 'Input is ' . join(', ', @input) . "\n";
};
600
601
# Process a single file (no command was given)
unless ($cmd) {

  # Take the start times at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and
  # since the start, and remember the new stop
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Only the first input is processed -
  # append a slash to the path, if it does not end with one
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
633
Nils Diewald59094f22014-11-05 18:20:50 +0000634
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object, in case the first input is a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $further_archive (@input[1..$#input]) {
    $archive->attach($further_archive);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
679
Akron81500102017-04-07 20:45:44 +0200680
Akron941c1a62016-02-23 17:41:41 +0100681# Process an archive
682elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000683
Akron81500102017-04-07 20:45:44 +0200684 my $archive_output;
685
686 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100687 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200688
689 # Create new archive object
690 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
691
692 # Check zip capabilities
693 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200694 $log->error("Unzip is not installed or incompatible.");
695 exit 1;
Akron81500102017-04-07 20:45:44 +0200696 };
697
698 # Add further annotation archived
699 $archive->attach($_) foreach @input[1..$#input];
700
701 # Create a temporary directory
702 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200703 $extract_dir = tempdir(CLEANUP => 0);
704 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200705 };
706
Akron63f20d42017-04-10 23:40:29 +0200707 # Add some random extra to avoid clashes with multiple archives
708 $extract_dir = catdir($extract_dir, random_string('cccccc'));
709
Akron31a08cb2019-02-20 20:43:26 +0100710 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200711 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200712 @input = ($extract_dir);
713 }
714 else {
715 $log->error('Unable to extract from primary archive ' . $input[0] .
716 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200717 exit 1;
Akron81500102017-04-07 20:45:44 +0200718 };
719 }
720
721 # Can't create archive object
722 else {
723 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200724 exit 1;
Akron81500102017-04-07 20:45:44 +0200725 };
726 };
727
Akron7d4cdd82016-08-17 21:39:45 +0200728 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100729 my $pool = Parallel::ForkManager->new($jobs);
730
Akron7d4cdd82016-08-17 21:39:45 +0200731 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100732 my $iter = 1; # Current text in process
733
Akronda3097e2017-04-23 19:53:57 +0200734 my $tar_archive;
735 my $output_dir = $output;
736 my $tar_fh;
737
738 # Initialize tar archive
739 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200740
741 # Set output name
742 my $tar_file = $output;
743 unless ($tar_file =~ /\.tar$/) {
744 $tar_file .= '.tar';
745 };
746
747 # Initiate the tar file
748 print "Writing to file $tar_file\n";
749 $tar_fh = IO::File->new($tar_file, 'w');
750 $tar_fh->binmode(1);
751
Akroneb370a02022-02-24 13:33:40 +0100752 # Use tar builder for archiving
753 if (eval("use Archive::Tar::Builder; 1;")) {
754 $tar_archive = Archive::Tar::Builder->new(
755 ignore_errors => 1
756 );
757
758 # Set handle
759 $tar_archive->set_handle($tar_fh);
760 }
761
762 # Fallback solution
763 else {
764 $tar_archive = KorAP::XML::TarBuilder->new(
765 $tar_fh
766 );
767 };
Akronda3097e2017-04-23 19:53:57 +0200768
769 # Output to temporary directory
770 $output_dir = File::Temp->newdir;
771 };
772
Akron941c1a62016-02-23 17:41:41 +0100773 # Report on fork message
774 $pool->run_on_finish (
775 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200776 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100777 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200778
Akron08385f62016-03-22 20:37:04 +0100779 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200780 ($iter++) . "/$count]" .
781 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200782 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200783
784 if (!$code && $to_tar && $data->[2]) {
785 my $filename = $data->[2];
786
787 # Lock filehandle
788 if (flock($tar_fh, LOCK_EX)) {
789
Akron9a062ce2017-07-04 19:12:05 +0200790 my $clean_file = fileparse($filename);
791
Akronda3097e2017-04-23 19:53:57 +0200792 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200793 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200794 unlink $filename;
795
796 # Unlock filehandle
797 flock($tar_fh, LOCK_UN);
798 }
799 else {
800 $log->warn("Unable to add $filename to archive");
801 };
802 };
803
Akron4c0cf312016-10-15 16:42:09 +0200804 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100805 }
806 );
807
808 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200809 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100810 print "Reading data ...\n";
811
Akron7d4cdd82016-08-17 21:39:45 +0200812 # unless (Cache::FastMmap->new(
813 # share_file => $cache_file,
814 # cache_size => $cache_size,
815 # init_file => $cache_init
816 # )) {
817 # print "Unable to intialize cache '$cache_file'\n\n";
818 # exit(1);
819 # };
Akron11c80302016-03-18 19:44:43 +0100820
Akron486f9ab2017-04-22 23:25:19 +0200821
Akron941c1a62016-02-23 17:41:41 +0100822 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100823 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200824 # TODO:
825 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100826 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100827 my @dirs;
828 my $dir;
829
Akron7d4cdd82016-08-17 21:39:45 +0200830 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100831 while (1) {
832 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200833 push @dirs, $dir;
834 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100835 };
836 last unless $it->next;
837 };
838
839 print "Start processing ...\n";
840 $t = Benchmark->new;
841 $count = scalar @dirs;
842
843 DIRECTORY_LOOP:
844 for (my $i = 0; $i < $count; $i++) {
845
Akrone1dbc382016-07-08 22:24:52 +0200846 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200847 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200848 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200849 );
Akron941c1a62016-02-23 17:41:41 +0100850
851 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200852 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200853
Akron13d56622016-10-31 14:54:49 +0100854 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200855 $pool->finish(
856 0,
Akronda3097e2017-04-23 19:53:57 +0200857 [
858 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
859 undef,
860 $filename
861 ]
Akron486f9ab2017-04-22 23:25:19 +0200862 );
Akron3ec48972016-08-17 23:24:52 +0200863 }
864 else {
Akron4c0cf312016-10-15 16:42:09 +0200865 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200866 };
Akron941c1a62016-02-23 17:41:41 +0100867 };
868 }
869
870 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200871 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200872
Akron941c1a62016-02-23 17:41:41 +0100873 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200874 $log->error("Unzip is not installed or incompatible.");
875 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100876 };
877
Akron08385f62016-03-22 20:37:04 +0100878 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200879 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100880
Akron31a08cb2019-02-20 20:43:26 +0100881 # Get sigles to extract
882 my $prefix = set_sigle($archive);
883
Akron941c1a62016-02-23 17:41:41 +0100884 print "Start processing ...\n";
885 $t = Benchmark->new;
886 my @dirs = $archive->list_texts;
887 $count = scalar @dirs;
888
889 ARCHIVE_LOOP:
890 for (my $i = 0; $i < $count; $i++) {
891
892 # Split path information
893 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
894
Akrone1dbc382016-07-08 22:24:52 +0200895 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200896 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200897 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200898 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200899 catfile($corpus, $doc, $text)
900 . '.json' . ($gzip ? '.gz' : '')
901 )
Akrone1dbc382016-07-08 22:24:52 +0200902 );
Akron941c1a62016-02-23 17:41:41 +0100903
904 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200905 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100906
Akron4c0cf312016-10-15 16:42:09 +0200907 # Create temporary file
908 $temp = File::Temp->newdir;
909
Akronbdf434a2016-10-24 17:42:07 +0200910 # TODO: Check if $filename exist at the beginning,
911 # because extraction can be horrible slow!
912
Akron941c1a62016-02-23 17:41:41 +0100913 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100914 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100915
Akron7d4cdd82016-08-17 21:39:45 +0200916 # Create corpus directory
917 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100918
Akron7d4cdd82016-08-17 21:39:45 +0200919 # Temporary directory
920 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100921
Akron7d4cdd82016-08-17 21:39:45 +0200922 # Write file
Akron13d56622016-10-31 14:54:49 +0100923 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200924
Akron4c0cf312016-10-15 16:42:09 +0200925 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100926 $pool->finish(
927 0,
Akronda3097e2017-04-23 19:53:57 +0200928 [
929 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
930 $temp,
931 $filename
932 ]
Akron13d56622016-10-31 14:54:49 +0100933 );
Akron7d4cdd82016-08-17 21:39:45 +0200934 }
935 else {
Akron4c0cf312016-10-15 16:42:09 +0200936 # Delete temporary file
937 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200938 };
Akron941c1a62016-02-23 17:41:41 +0100939 }
Akron7d4cdd82016-08-17 21:39:45 +0200940
941 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100942 else {
Akron4c0cf312016-10-15 16:42:09 +0200943 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100944 };
945 };
946 }
947
948 else {
949 print "Input is neither a directory nor an archive.\n\n";
950 };
951
952 $pool->wait_all_children;
953
Akron11c80302016-03-18 19:44:43 +0100954 # Delete cache file
955 unlink($cache_file) if $cache_delete;
956
Akronda3097e2017-04-23 19:53:57 +0200957 # Close tar filehandle
958 if ($to_tar && $tar_fh) {
959 $tar_archive->finish;
960 $tar_fh->close;
961 print "Wrote to tar archive.\n";
962 };
963
Akron63f20d42017-04-10 23:40:29 +0200964 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100965 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200966};
Akron941c1a62016-02-23 17:41:41 +0100967
Nils Diewald2db9ad02013-10-29 19:26:43 +0000968
# Build the list of sigles to process for an archive.
#
# Takes the archive object as its only argument and works on the
# @sigle list that is declared outside of this sub:
#
# - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#   for every text listed by the archive.
# - Otherwise, every entry that looks like a document sigle
#   ("corpus/doc", optionally with a leading "./") is extracted
#   immediately into $output and dropped from @sigle; all other
#   entries are kept as text sigles.
#
# Returns the prefix value reported by the archive (the first element
# returned by split_path, or the result of check_prefix), or 1 if
# neither was ever called.
# NOTE(review): the exact meaning of the prefix value is defined by
# KorAP::XML::Archive - confirm there.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given: take all texts of the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given: extract doc sigles, keep text sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Ask the archive for its prefix, but only on the first call.
  # The message is printed without a newline, so callers decide
  # how the line is continued.
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  foreach my $entry (@sigle) {

    # Sigle is a doc sigle
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$entry ...";
      $check_prefix_once->();
      print "\n";

      # With sequential extraction, a single job is used
      # instead of the configured number of jobs
      print '... ' . (
        $archive->extract_sigle([$entry], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $entry;
      $check_prefix_once->();
    };
  };

  # Only text sigles remain to be processed
  @sigle = @text_sigle;

  return $prefix;
};
1042
1043
# Remove the temporary extraction directory, if one was used,
# and log how many objects remove_tree reported as removed
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info("Removed directory $extract_dir with $removed objects");
};


# Finish the output with an empty line
print "\n";
1052
Nils Diewald2db9ad02013-10-29 19:26:43 +00001053__END__
Akron941c1a62016-02-23 17:41:41 +01001054
1055=pod
1056
1057=encoding utf8
1058
1059=head1 NAME
1060
Akron42f48c12020-02-14 13:08:13 +01001061korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001062
1063
1064=head1 SYNOPSIS
1065
Akrona76d8352016-10-27 16:27:32 +02001066 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001067
Akron2fd402b2016-10-27 21:26:48 +02001068
Akron941c1a62016-02-23 17:41:41 +01001069=head1 DESCRIPTION
1070
1071L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1072compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001073The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001074
1075
1076=head1 INSTALLATION
1077
1078The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1079
Akronaf386982016-10-12 00:33:25 +02001080 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001081
Akronc13a1702016-03-15 19:33:14 +01001082In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001083be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001084Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001085Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1086Optional support for L<Sys::Info> to calculate available cores is available.
Akrona93d51b2016-10-24 20:27:48 +02001087In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001088
1089=head1 ARGUMENTS
1090
Akrona76d8352016-10-27 16:27:32 +02001091 $ korapxml2krill -z --input <directory> --output <filename>
1092
1093Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001094It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001095
Akron941c1a62016-02-23 17:41:41 +01001096=over 2
1097
1098=item B<archive>
1099
Akron081639e2017-04-21 19:01:39 +02001100 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001101
Akron2fd402b2016-10-27 21:26:48 +02001102Converts an archive of KorAP-XML documents. It expects a directory
1103(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001104
1105=item B<extract>
1106
Akrona76d8352016-10-27 16:27:32 +02001107 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1108
1109Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001110
Akron63f20d42017-04-10 23:40:29 +02001111=item B<serial>
1112
1113 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1114
1115Convert archives sequentially. The inputs are not merged but treated
1116as they are (so they may be premerged or globs).
1117The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001118are created based on the archive name. In case the C<--to-tar> flag is given,
1119the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001120
1121
Akron84b53ad2022-01-14 12:39:15 +01001122=item B<slimlog>
1123
1124 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1125
1126Filters out all useless (i.e. successful) information from logs, to simplify
1127log checks. Expects no further options.
1128
1129
Akron941c1a62016-02-23 17:41:41 +01001130=back
1131
1132
1133=head1 OPTIONS
1134
1135=over 2
1136
Akrona76d8352016-10-27 16:27:32 +02001137=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001138
Akrona76d8352016-10-27 16:27:32 +02001139Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001140
Akron7606afa2016-10-25 16:23:49 +02001141Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001142document, while C<archive> expects a KorAP-XML corpus folder or a zip
1143file to batch process multiple files.
1144C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001145
Akrona76d8352016-10-27 16:27:32 +02001146C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001147that the first archive listed contains all primary data files
1148and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001149
Akron7606afa2016-10-25 16:23:49 +02001150 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001151
Akron821db3d2017-04-06 21:19:31 +02001152Input may also be defined using BSD glob wildcards.
1153
1154 -i 'file/news*.zip'
1155
1156The extended input array will be sorted in length order, so the shortest
1157path needs to contain all primary data files and all meta data files.
1158
Akron0c3e3752016-06-28 15:55:53 +02001159(The directory structure follows the base directory format,
1160that may include a C<.> root folder.
1161In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001162need to be passed with a hash sign in front of the archive's name.
1163This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001164
Akron7606afa2016-10-25 16:23:49 +02001165To support zip files, a version of C<unzip> needs to be installed that is
1166compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001167
Akron7606afa2016-10-25 16:23:49 +02001168B<The root folder switch using the hash sign is experimental and
1169may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001170
Akronf73ffb62018-06-27 12:13:59 +02001171
Akron63f20d42017-04-10 23:40:29 +02001172=item B<--input-base|-ib> <directory>
1173
1174The base directory for inputs.
1175
1176
Akron941c1a62016-02-23 17:41:41 +01001177=item B<--output|-o> <directory|file>
1178
1179Output folder for archive processing or
1180document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001181writes to C<STDOUT> by default
1182(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001183
1184=item B<--overwrite|-w>
1185
1186Overwrite files that already exist.
1187
Akronf73ffb62018-06-27 12:13:59 +02001188
Akron3741f8b2016-12-21 19:55:21 +01001189=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001190
1191Define the default tokenization by specifying
1192the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001193of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001194This will directly take the file instead of running
1195the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001196
Akron3741f8b2016-12-21 19:55:21 +01001197
1198=item B<--base-sentences|-bs> <foundry>#<layer>
1199
1200Define the layer for base sentences.
1201If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001202Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1203layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001204
1205 Defaults to unset.
1206
1207
1208=item B<--base-paragraphs|-bp> <foundry>#<layer>
1209
1210Define the layer for base paragraphs.
1211If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001212Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1213layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001214
1215 Defaults to unset.
1216
1217
Akron41ac10b2017-02-08 22:47:25 +01001218=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1219
1220Define the layer for base pagebreaks.
1221Currently C<DeReKo#Structure> is the only layer supported.
1222
1223 Defaults to unset.
1224
1225
Akron941c1a62016-02-23 17:41:41 +01001226=item B<--skip|-s> <foundry>[#<layer>]
1227
Akronf7ad89e2016-03-16 18:22:47 +01001228Skip specific annotations by specifying the foundry
1229(and optionally the layer with a C<#>-prefix),
1230e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001231Can be set multiple times.
1232
Akronf73ffb62018-06-27 12:13:59 +02001233
Akronc13a1702016-03-15 19:33:14 +01001234=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001235
Akronf7ad89e2016-03-16 18:22:47 +01001236Convert specific annotations by specifying the foundry
1237(and optionally the layer with a C<#>-prefix),
1238e.g. C<Mate> or C<Mate#Morpho>.
1239Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001240
Akronf73ffb62018-06-27 12:13:59 +02001241
Akroned9baf02019-01-22 17:03:25 +01001242=item B<--non-word-tokens|-nwt>
1243
1244Tokenize non-word tokens like word tokens (defined as matching
1245C</[\d\w]/>). Useful to treat punctuations as tokens.
1246
1247 Defaults to unset.
1248
Akronf1849aa2019-12-16 23:35:33 +01001249
1250=item B<--non-verbal-tokens|-nvt>
1251
1252Tokenize non-verbal tokens marked in the primary data with
1253the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1254
1255 Defaults to unset.
1256
1257
Akron941c1a62016-02-23 17:41:41 +01001258=item B<--jobs|-j>
1259
1260Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001261for archive processing.
Akron11c80302016-03-18 19:44:43 +01001262Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001263
1264If C<sequential-extraction> is not set to false, this will
1265also apply to extraction.
1266
Akronc11f7982017-02-21 21:20:14 +01001267Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001268times the number of available cores, in case L<Sys::Info>
1269is available.
Akronf7ad89e2016-03-16 18:22:47 +01001270This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001271
Akronf73ffb62018-06-27 12:13:59 +02001272
Akron263274c2019-02-07 09:48:30 +01001273=item B<--koral|-k>
1274
1275Version of the output format. Supported versions are:
1276C<0> for legacy serialization, C<0.03> for serialization
1277with metadata fields as key-values on the root object,
1278C<0.4> for serialization with metadata fields as a list
1279of C<"@type":"koral:field"> objects.
1280
1281Currently defaults to C<0.03>.
1282
1283
Akron9ec88872017-04-12 16:29:06 +02001284=item B<--sequential-extraction|-se>
1285
1286Flag to indicate that extraction runs sequentially, i.e. that the C<jobs> value does not apply to extraction.
1287Some systems may have problems with extracting multiple archives
1288to the same folder at the same time.
1289Can be flagged using C<--no-sequential-extraction> as well.
1290Defaults to C<false>.
1291
Akronf73ffb62018-06-27 12:13:59 +02001292
Akron35db6e32016-03-17 22:42:22 +01001293=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001294
Akron35db6e32016-03-17 22:42:22 +01001295Define the metadata parser to use. Defaults to C<I5>.
1296Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1297This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001298
Akronf73ffb62018-06-27 12:13:59 +02001299
Akron941c1a62016-02-23 17:41:41 +01001300=item B<--gzip|-z>
1301
Akronf7ad89e2016-03-16 18:22:47 +01001302Compress the output.
1303Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001304
Akronf73ffb62018-06-27 12:13:59 +02001305
Akron11c80302016-03-18 19:44:43 +01001306=item B<--cache|-c>
1307
1308File to mmap a cache (using L<Cache::FastMmap>).
1309Defaults to C<korapxml2krill.cache> in the calling directory.
1310
Akronf73ffb62018-06-27 12:13:59 +02001311
Akron11c80302016-03-18 19:44:43 +01001312=item B<--cache-size|-cs>
1313
1314Size of the cache. Defaults to C<50m>.
1315
Akronf73ffb62018-06-27 12:13:59 +02001316
Akron11c80302016-03-18 19:44:43 +01001317=item B<--cache-init|-ci>
1318
1319Initialize cache file.
1320Can be flagged using C<--no-cache-init> as well.
1321Defaults to C<true>.
1322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akron11c80302016-03-18 19:44:43 +01001324=item B<--cache-delete|-cd>
1325
1326Delete cache file after processing.
1327Can be flagged using C<--no-cache-delete> as well.
1328Defaults to C<true>.
1329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron636aa112017-04-07 18:48:56 +02001331=item B<--config|-cfg>
1332
1333Configure the parameters of your call in a file
1334of key-value pairs with whitespace separator
1335
1336 overwrite 1
1337 token DeReKo#Structure
1338 ...
1339
1340Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001341C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001342C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001343C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001344C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001345C<base-sentences>, C<base-paragraphs>,
1346C<base-pagebreaks>,
1347C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001348(semicolon separated), C<anno> (semicolon separated).
1349
Akronf73ffb62018-06-27 12:13:59 +02001350Configuration parameters will always be overwritten by
1351passed parameters.
1352
1353
Akron81500102017-04-07 20:45:44 +02001354=item B<--temporary-extract|-te>
1355
1356Only valid for the C<archive> command.
1357
1358This will first extract all files into a
1359directory and then will archive.
1360If the directory is given as C<:temp:>,
1361a temporary directory is used.
1362This is especially useful to avoid
1363massive unzipping and potential
1364network latency.
Akron636aa112017-04-07 18:48:56 +02001365
Akronf73ffb62018-06-27 12:13:59 +02001366
Akronc93a0802019-07-11 15:48:34 +02001367=item B<--to-tar>
1368
1369Only valid for the C<archive> command.
1370
1371Writes the output into a tar archive.
1372
1373
Akrone10ad322016-02-27 10:54:26 +01001374=item B<--sigle|-sg>
1375
Akron20807582016-10-26 17:11:34 +02001376Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001377Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001378I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001379Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001380In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001381On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001382
Akron64f7fae2022-07-27 12:45:33 +02001383=item B<--lang>
1384
1385Preferred language for metadata fields. In case multiple titles are
1386given (on any level) with different C<xml:lang> attributes,
1387the language given is preferred.
1388Because titles may have different sources and different priorities,
1389non-specific language titles may still be preferred in case the title
1390source has a higher priority.
1391
Akronf73ffb62018-06-27 12:13:59 +02001392
Akron941c1a62016-02-23 17:41:41 +01001393=item B<--log|-l>
1394
Akronb9c33812020-10-21 16:19:35 +02001395The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001396
Akronf73ffb62018-06-27 12:13:59 +02001397
Akron941c1a62016-02-23 17:41:41 +01001398=item B<--help|-h>
1399
Akron42f48c12020-02-14 13:08:13 +01001400Print help information.
Akron941c1a62016-02-23 17:41:41 +01001401
Akronf73ffb62018-06-27 12:13:59 +02001402
Akron941c1a62016-02-23 17:41:41 +01001403=item B<--version|-v>
1404
1405Print version information.
1406
1407=back
1408
Akronf73ffb62018-06-27 12:13:59 +02001409
Akronc13a1702016-03-15 19:33:14 +01001410=head1 ANNOTATION SUPPORT
1411
1412L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1413developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1414The base foundry with paragraphs, sentences, and the text element are mandatory for
1415L<Krill|https://github.com/KorAP/Krill>.
1416
Akron821db3d2017-04-06 21:19:31 +02001417 Base
1418 #Paragraphs
1419 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001420
Akron821db3d2017-04-06 21:19:31 +02001421 Connexor
1422 #Morpho
1423 #Phrase
1424 #Sentences
1425 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001426
Akron821db3d2017-04-06 21:19:31 +02001427 CoreNLP
1428 #Constituency
1429 #Morpho
1430 #NamedEntities
1431 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001432
Akronce125b62017-06-19 11:54:36 +02001433 CMC
1434 #Morpho
1435
Akron821db3d2017-04-06 21:19:31 +02001436 DeReKo
1437 #Structure
Akronc13a1702016-03-15 19:33:14 +01001438
Akron57510c12019-01-04 14:58:53 +01001439 DGD
1440 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001441 #Structure
Akron57510c12019-01-04 14:58:53 +01001442
Akron821db3d2017-04-06 21:19:31 +02001443 DRuKoLa
1444 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001445
Akron821db3d2017-04-06 21:19:31 +02001446 Glemm
1447 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001448
Akronabb36902021-10-11 15:51:06 +02001449 Gingko
1450 #Morpho
1451
Akronea1aed52018-07-19 14:43:34 +02001452 HNC
1453 #Morpho
1454
Akron4c679192018-01-16 17:41:49 +01001455 LWC
1456 #Dependency
1457
Akron821db3d2017-04-06 21:19:31 +02001458 Malt
1459 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001460
Akron821db3d2017-04-06 21:19:31 +02001461 MarMoT
1462 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001463
Akron821db3d2017-04-06 21:19:31 +02001464 Mate
1465 #Dependency
1466 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001467
Akron821db3d2017-04-06 21:19:31 +02001468 MDParser
1469 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001470
Akrone85a7762022-07-22 08:05:03 +02001471 NKJP
1472 #Morpho
1473 #NamedEntities
1474
Akron821db3d2017-04-06 21:19:31 +02001475 OpenNLP
1476 #Morpho
1477 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001478
Akron07e24772020-04-23 14:00:54 +02001479 RWK
1480 #Morpho
1481 #Structure
1482
Akron821db3d2017-04-06 21:19:31 +02001483 Sgbr
1484 #Lemma
1485 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001486
Akron7d5e6382019-08-08 16:36:27 +02001487 Talismane
1488 #Dependency
1489 #Morpho
1490
Akron821db3d2017-04-06 21:19:31 +02001491 TreeTagger
1492 #Morpho
1493 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001494
Akron821db3d2017-04-06 21:19:31 +02001495 XIP
1496 #Constituency
1497 #Morpho
1498 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001499
Akronc13a1702016-03-15 19:33:14 +01001500
1501More importers are in preparation.
1502New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1503See the built-in annotation importers as examples.
1504
Akronf73ffb62018-06-27 12:13:59 +02001505
Akron41e6c8b2021-10-14 20:22:18 +02001506=head1 METADATA SUPPORT
1507
1508L<KorAP::XML::Krill> has built-in importers for some meta data variants
1509developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1510
1511=over 2
1512
1513=item I5 - Meta data for all I5 files
1514
1515=item Sgbr - Meta data from the Schreibgebrauch project
1516
1517=item Gingko - Meta data from the Gingko project in addition to I5
1518
1519=back
1520
1521More importers are in preparation.
1522New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1523See the built-in meta data importers as examples.
1524
1525
Akron8f69d632020-01-15 16:58:11 +01001526=head1 About KorAP-XML
1527
1528KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1529data model (Bański et al. 2013), where text data are stored physically
1530separated from their interpretations (i.e. annotations).
1531A text document in KorAP-XML therefore consists of several files
1532containing primary data, metadata and annotations.
1533
1534The structure of a single KorAP-XML document can be as follows:
1535
1536 - data.xml
1537 - header.xml
1538 + base
1539 - tokens.xml
1540 - ...
1541 + struct
1542 - structure.xml
1543 - ...
1544 + corenlp
1545 - morpho.xml
1546 - constituency.xml
1547 - ...
1548 + tree_tagger
1549 - morpho.xml
1550 - ...
1551 - ...
1552
1553The C<data.xml> contains the primary data, the C<header.xml> contains
1554the metadata, and the annotation layers are stored in subfolders
1555like C<base>, C<struct> or C<corenlp>
1556(so-called "foundries"; Bański et al. 2013).
1557
1558Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001559(Lüngen and Sperberg-McQueen 2012). See the documentation in
1560L<KorAP::XML::Meta::I5> for translatable fields.
1561
1562Annotations correspond to a variant of the TEI-P5 feature structures
1563(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001564Annotation feature structures refer to character sequences of the primary text
1565inside the C<text> element of the C<data.xml>.
1566A single annotation containing the lemma of a token can have the following structure:
1567
1568 <span from="0" to="3">
1569 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1570 <f name="lex">
1571 <fs>
1572 <f name="lemma">zum</f>
1573 </fs>
1574 </f>
1575 </fs>
1576 </span>
1577
1578The C<from> and C<to> attributes refer to the character span
1579in the primary text.
1580Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1581the structure may vary. See L<KorAP::XML::Annotation::*> for various
1582annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001583
1584Multiple KorAP-XML documents are organized on three levels following
1585the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1586corpus E<gt> document E<gt> text. On each level metadata information
1587can be stored, that C<korapxml2krill> will merge to a single metadata
1588object per text. A corpus is therefore structured as follows:
1589
1590 + <corpus>
1591 - header.xml
1592 + <document>
1593 - header.xml
1594 + <text>
1595 - data.xml
1596 - header.xml
1597 - ...
1598 - ...
1599
1600A single text can be identified by the concatenation of
1601the corpus identifier, the document identifier and the text identifier.
1602This identifier is called the text sigle
1603(e.g. a text with the identifier C<18486> in the document C<060> in the
1604corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1605
1606These corpora are often stored in zip files, which C<korapxml2krill>
1607can deal with. Corpora may also be split in multiple zip archives
1608(e.g. one zip file per foundry), which is also supported (see C<--input>).
1609
1610Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1611in form of a test suite.
1612The resulting JSON format merges all annotation layers
1613based on a single token stream.
1614
1615=head2 References
1616
1617Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1618KorAP data model: first approximation, December.
1619
1620Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1621"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1622Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1623L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1624
1625Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1626"Robust corpus architecture: a new look at virtual collections and data access",
1627Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1628L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1629
1630Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1631Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1632"Towards an international standard on feature structure representation",
1633Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1634pp. 373-376.
1635L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1636
1637Harald Lüngen and C. M. Sperberg-McQueen (2012):
1638"A TEI P5 Document Grammar for the IDS Text Model",
1639Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1640L<PDF|https://journals.openedition.org/jtei/pdf/508>
1641
1642TEI Consortium, eds:
1643"Feature Structures",
1644Guidelines for Electronic Text Encoding and Interchange.
1645L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1646
Akron941c1a62016-02-23 17:41:41 +01001647=head1 AVAILABILITY
1648
1649 https://github.com/KorAP/KorAP-XML-Krill
1650
1651
1652=head1 COPYRIGHT AND LICENSE
1653
Akron9a2545e2022-01-16 15:15:50 +01001654Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001655
Akron6882d7d2021-02-08 09:43:57 +01001656Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001657
Akrona76d8352016-10-27 16:27:32 +02001658Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001659
Akron6882d7d2021-02-08 09:43:57 +01001660L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001661Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001662L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001663member of the
Akronf1849aa2019-12-16 23:35:33 +01001664L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001665
1666This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001667L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001668
1669=cut