blob: 9c652ec0c65cbaadf8ae478260d64157ae0cf8ef [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akron941c1a62016-02-23 17:41:41 +0100161# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100162
# Release information, printed by --help and --version
our $LAST_CHANGE   = '2022/03/04';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   = 'Version ' . $KorAP::XML::Krill::VERSION
  . ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";

# Parse command:
# The first argument is treated as the command,
# unless it starts with a dash (and is therefore an option)
our @ARGV;
my $cmd = ($ARGV[0] && index($ARGV[0], '-') != 0) ? shift(@ARGV) : undef;

# Keep a copy of the untouched option list, as GetOptions
# will consume @ARGV (the copy is forwarded by 'serial')
my @keep_argv = @ARGV;

# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash for all single valued options
my %cfg;
Akrone10ad322016-02-27 10:54:26 +0100182
# Parse options from the command line.
# Single valued options are written directly into %cfg,
# repeatable options are collected in their own arrays.
# The configuration file path is kept separately, as it is
# evaluated before the configuration hash is completed.
my $cfg_file;

# GetOptions returns false if an option is unknown or a value
# is invalid. In that case stop with the usage information
# instead of continuing with a partially parsed configuration.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'gzip|z'                   => \($cfg{gzip}),
  'temporary-extract|te=s'   => \($cfg{extract_dir}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \($cfg{cache_file}),
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \($cfg{log}),
  'anno|a=s'                 => \@anno,

  # Deprecated flags are still accepted, but only warn
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },
  'jobs|j=i'                 => \($cfg{jobs}),
  'koral|k=f'                => \($cfg{koral}),
  'to-tar'                   => \($cfg{to_tar}),
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),

  # Print the full usage information
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
233
# Parameters for pod2usage in case of invalid invocations:
# print the usage sections and exit with an error code
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Load from configuration and fill non-given data.
# Values passed on the command line always take precedence.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Single valued options: 'some-key' is stored as $cfg{some_key}
  foreach (qw!output cache-size input-base token overwrite
              meta base-sentences base-paragraphs base-pagebreaks
              gzip to-tar log cache non-word-tokens
              non-verbal-tokens sequential-extraction
              temporary-extract cache-init
              koral extract-dir jobs!) {
    my $underlined = $_ =~ tr/-/_/r;
    if (!defined($cfg{$underlined}) && defined $config{$_}) {
      $cfg{$underlined} = $config{$_};
    };
  };

  # The command line option --temporary-extract is stored as
  # 'extract_dir', so the equally named configuration key has
  # to end up there as well to have any effect
  if (!defined($cfg{extract_dir}) && defined $cfg{temporary_extract}) {
    $cfg{extract_dir} = $cfg{temporary_extract};
  };

  # Multi valued options (skip, sigle, anno) are given as
  # semicolon separated lists in the configuration
  my %list_for = (
    skip  => \@skip,
    sigle => \@sigle,
    anno  => \@anno
  );
  foreach my $name (qw!skip sigle anno!) {
    my $list = $list_for{$name};

    # Values from the command line win
    next if scalar(@$list) || !defined $config{$name};

    @$list = split /\s*;\s*/, $config{$name};
  };
}

# A configuration file was requested but is not available
elsif ($cfg_file) {
  warn "Configuration file '$cfg_file' can't be found\n";
};
277
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option is stored as 'cache_file',
# while the configuration file key is stored as 'cache'
my $cache_file  = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs            = $cfg{jobs} // 0;
my $cache_delete    = $cfg{cache_delete} // 1;
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is separated from the conditional assignment,
# as 'my ... if ...' has no defined behaviour in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (the layer is undefined,
# if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct ('A_B.C' becomes 'A/B/C')
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of lowercased foundries and layers to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Log to STDERR using the requested log level
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);
Akron63f20d42017-04-10 23:40:29 +0200308
# Start log slimming:
# The first remaining argument is the log file to filter
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  my $log_file = shift @ARGV;

  # The file name may be missing completely,
  # so check for definedness before testing the file
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  my $slimmer = KorAP::XML::Log::Slim->new($log_file);

  # Run log filter
  $slimmer->slim_to;

  exit;
};
330
331
# If a command writes to an output that is not meant to be
# a tar file, the output has to be an existing directory
# (a missing path is not a directory either)
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
pod2usage(%ERROR_HASH) if !@input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000342
# Start serial processing:
# Every input is archived by a separate run of this script,
# each writing to an output named after the input.
# The script exits with an error code, if any of the runs failed.
if ($cmd && $cmd eq 'serial') {

  # Remove all inputs and outputs from the arguments to forward,
  # as they are set individually for each run
  my $remove_next = 0;
  my @forward_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Input or output flag with an attached value (e.g. --input=x)
    next if $arg =~ /^--?(?:i|input|o|output)=/;

    # Pass parameter
    push @forward_argv, $arg;
  };
  @keep_argv = @forward_argv;

  # Remember if any run was not successful
  my $failed = 0;

  # Iterate over all inputs
  foreach my $single_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($single_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $single_input, '-o', $new_out
    );
    print "Start serial processing of $single_input to $new_out\n";

    # Start archiving and report failing runs
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Archiving of $single_input failed (" .
          ($? == -1 ? "unable to start: $!" : 'exit code ' . ($? >> 8)) .
          ')'
        );
      $failed = 1;
    };
  };

  exit($failed ? 1 : 0);
};
392
# Define supported (and preinstalled) transformation modules.
# Each entry has the form [Foundry, Layer] with an optional
# third value. The order of the entries is kept as is.
my @layers = ();

# Base layers are only added, if no base-sentences
# or base-paragraphs value is set
push @layers, ['Base', 'Sentences']  if !$base_sentences;
push @layers, ['Base', 'Paragraphs'] if !$base_paragraphs;

push @layers,
  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'];

# DeReKo:
# Collect all base structures that are set to 'dereko#structure'
my @dereko_attr =
  map  { $_->[1] }
  grep { $_->[0] eq 'dereko#structure' } (
    [$base_sentences,  'sentences'],
    [$base_paragraphs, 'paragraphs'],
    [$base_pagebreaks, 'pagebreaks']
  );

# Pass the collected base structures as a single joined value
push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
  );

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

push @layers,
  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # NKJP
  ['NKJP', 'Morpho'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'];

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

push @layers,
  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'];
Akrone1dbc382016-07-08 22:24:52 +0200513
Akron4fa37c32017-01-20 14:43:10 +0100514
# Check filters
my @filtered_anno;

# Everything is skipped by default, so only the explicitly
# requested annotations ('Foundry#Layer') are taken into account
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped -
# either by their foundry or by their foundry and layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
533
Akrone1dbc382016-07-08 22:24:52 +0200534
# TODO: This should not be initialized for batch

# Parameters of the cache shared between all processes
my @cache_param = (
  share_file => $cache_file,
  cache_size => ($cfg{cache_size} // '50m'),
  init_file  => ($cfg{cache_init} // 1)
);
my $cache = Cache::FastMmap->new(@cache_param);

# Parameters of the batch object, based on the options
# given or their default values
my @batch_param = (
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => ($cfg{koral} // $KORAL_VERSION),
  anno              => \@filtered_anno,
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
556
Akrone512b7c2020-08-07 16:16:12 +0200557
# Auto adjust jobs:
# A value of -1 sets the number of jobs to five times the
# number of cores (or to five, if the cores can't be determined)
if ($jobs eq '-1') {

  # Sys::Info is optional, so try to load it at runtime
  my $has_sys_info = eval("use Sys::Info; 1;")
    && eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  my $cores = 1;
  if ($has_sys_info) {
    $cores = Sys::Info->new->device('CPU')->count;
  }
  else {
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
571
572
# Glob and prefix files:
# Replace all given inputs by the files they expand to
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($pattern));
  };

  # Wildcards without any match expand to nothing -
  # stop here instead of processing an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
592
593
# Process a single file (no command was given)
if (!$cmd) {

  # Take the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  # of the script, and remember the current time as the last stop
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Only the first input is taken into account -
  # make sure it ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
625
Nils Diewald59094f22014-11-05 18:20:50 +0000626
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object, in case the primary input is a file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    # Report on success or failure
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
671
Akron81500102017-04-07 20:45:44 +0200672
Akron941c1a62016-02-23 17:41:41 +0100673# Process an archive
674elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000675
Akron81500102017-04-07 20:45:44 +0200676 my $archive_output;
677
678 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100679 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200680
681 # Create new archive object
682 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
683
684 # Check zip capabilities
685 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200686 $log->error("Unzip is not installed or incompatible.");
687 exit 1;
Akron81500102017-04-07 20:45:44 +0200688 };
689
690 # Add further annotation archived
691 $archive->attach($_) foreach @input[1..$#input];
692
693 # Create a temporary directory
694 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200695 $extract_dir = tempdir(CLEANUP => 0);
696 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200697 };
698
Akron63f20d42017-04-10 23:40:29 +0200699 # Add some random extra to avoid clashes with multiple archives
700 $extract_dir = catdir($extract_dir, random_string('cccccc'));
701
Akron31a08cb2019-02-20 20:43:26 +0100702 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200703 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200704 @input = ($extract_dir);
705 }
706 else {
707 $log->error('Unable to extract from primary archive ' . $input[0] .
708 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200709 exit 1;
Akron81500102017-04-07 20:45:44 +0200710 };
711 }
712
713 # Can't create archive object
714 else {
715 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200716 exit 1;
Akron81500102017-04-07 20:45:44 +0200717 };
718 };
719
Akron7d4cdd82016-08-17 21:39:45 +0200720 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100721 my $pool = Parallel::ForkManager->new($jobs);
722
Akron7d4cdd82016-08-17 21:39:45 +0200723 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100724 my $iter = 1; # Current text in process
725
Akronda3097e2017-04-23 19:53:57 +0200726 my $tar_archive;
727 my $output_dir = $output;
728 my $tar_fh;
729
730 # Initialize tar archive
731 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200732
733 # Set output name
734 my $tar_file = $output;
735 unless ($tar_file =~ /\.tar$/) {
736 $tar_file .= '.tar';
737 };
738
739 # Initiate the tar file
740 print "Writing to file $tar_file\n";
741 $tar_fh = IO::File->new($tar_file, 'w');
742 $tar_fh->binmode(1);
743
Akroneb370a02022-02-24 13:33:40 +0100744 # Use tar builder for archiving
745 if (eval("use Archive::Tar::Builder; 1;")) {
746 $tar_archive = Archive::Tar::Builder->new(
747 ignore_errors => 1
748 );
749
750 # Set handle
751 $tar_archive->set_handle($tar_fh);
752 }
753
754 # Fallback solution
755 else {
756 $tar_archive = KorAP::XML::TarBuilder->new(
757 $tar_fh
758 );
759 };
Akronda3097e2017-04-23 19:53:57 +0200760
761 # Output to temporary directory
762 $output_dir = File::Temp->newdir;
763 };
764
Akron941c1a62016-02-23 17:41:41 +0100765 # Report on fork message
766 $pool->run_on_finish (
767 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200768 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100769 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200770
Akron08385f62016-03-22 20:37:04 +0100771 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200772 ($iter++) . "/$count]" .
773 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200774 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200775
776 if (!$code && $to_tar && $data->[2]) {
777 my $filename = $data->[2];
778
779 # Lock filehandle
780 if (flock($tar_fh, LOCK_EX)) {
781
Akron9a062ce2017-07-04 19:12:05 +0200782 my $clean_file = fileparse($filename);
783
Akronda3097e2017-04-23 19:53:57 +0200784 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200785 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200786 unlink $filename;
787
788 # Unlock filehandle
789 flock($tar_fh, LOCK_UN);
790 }
791 else {
792 $log->warn("Unable to add $filename to archive");
793 };
794 };
795
Akron4c0cf312016-10-15 16:42:09 +0200796 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100797 }
798 );
799
800 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200801 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100802 print "Reading data ...\n";
803
Akron7d4cdd82016-08-17 21:39:45 +0200804 # unless (Cache::FastMmap->new(
805 # share_file => $cache_file,
806 # cache_size => $cache_size,
807 # init_file => $cache_init
808 # )) {
809 # print "Unable to intialize cache '$cache_file'\n\n";
810 # exit(1);
811 # };
Akron11c80302016-03-18 19:44:43 +0100812
Akron486f9ab2017-04-22 23:25:19 +0200813
Akron941c1a62016-02-23 17:41:41 +0100814 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100815 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200816 # TODO:
817 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100818 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100819 my @dirs;
820 my $dir;
821
Akron7d4cdd82016-08-17 21:39:45 +0200822 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100823 while (1) {
824 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200825 push @dirs, $dir;
826 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100827 };
828 last unless $it->next;
829 };
830
831 print "Start processing ...\n";
832 $t = Benchmark->new;
833 $count = scalar @dirs;
834
835 DIRECTORY_LOOP:
836 for (my $i = 0; $i < $count; $i++) {
837
Akrone1dbc382016-07-08 22:24:52 +0200838 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200839 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200840 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200841 );
Akron941c1a62016-02-23 17:41:41 +0100842
843 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200844 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200845
Akron13d56622016-10-31 14:54:49 +0100846 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200847 $pool->finish(
848 0,
Akronda3097e2017-04-23 19:53:57 +0200849 [
850 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
851 undef,
852 $filename
853 ]
Akron486f9ab2017-04-22 23:25:19 +0200854 );
Akron3ec48972016-08-17 23:24:52 +0200855 }
856 else {
Akron4c0cf312016-10-15 16:42:09 +0200857 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200858 };
Akron941c1a62016-02-23 17:41:41 +0100859 };
860 }
861
862 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200863 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200864
Akron941c1a62016-02-23 17:41:41 +0100865 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200866 $log->error("Unzip is not installed or incompatible.");
867 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100868 };
869
Akron08385f62016-03-22 20:37:04 +0100870 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200871 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100872
Akron31a08cb2019-02-20 20:43:26 +0100873 # Get sigles to extract
874 my $prefix = set_sigle($archive);
875
Akron941c1a62016-02-23 17:41:41 +0100876 print "Start processing ...\n";
877 $t = Benchmark->new;
878 my @dirs = $archive->list_texts;
879 $count = scalar @dirs;
880
881 ARCHIVE_LOOP:
882 for (my $i = 0; $i < $count; $i++) {
883
884 # Split path information
885 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
886
Akrone1dbc382016-07-08 22:24:52 +0200887 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200888 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200889 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200890 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200891 catfile($corpus, $doc, $text)
892 . '.json' . ($gzip ? '.gz' : '')
893 )
Akrone1dbc382016-07-08 22:24:52 +0200894 );
Akron941c1a62016-02-23 17:41:41 +0100895
896 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200897 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100898
Akron4c0cf312016-10-15 16:42:09 +0200899 # Create temporary file
900 $temp = File::Temp->newdir;
901
Akronbdf434a2016-10-24 17:42:07 +0200902 # TODO: Check if $filename exist at the beginning,
903 # because extraction can be horrible slow!
904
Akron941c1a62016-02-23 17:41:41 +0100905 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100906 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100907
Akron7d4cdd82016-08-17 21:39:45 +0200908 # Create corpus directory
909 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100910
Akron7d4cdd82016-08-17 21:39:45 +0200911 # Temporary directory
912 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100913
Akron7d4cdd82016-08-17 21:39:45 +0200914 # Write file
Akron13d56622016-10-31 14:54:49 +0100915 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200916
Akron4c0cf312016-10-15 16:42:09 +0200917 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100918 $pool->finish(
919 0,
Akronda3097e2017-04-23 19:53:57 +0200920 [
921 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
922 $temp,
923 $filename
924 ]
Akron13d56622016-10-31 14:54:49 +0100925 );
Akron7d4cdd82016-08-17 21:39:45 +0200926 }
927 else {
Akron4c0cf312016-10-15 16:42:09 +0200928 # Delete temporary file
929 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200930 };
Akron941c1a62016-02-23 17:41:41 +0100931 }
Akron7d4cdd82016-08-17 21:39:45 +0200932
933 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100934 else {
Akron4c0cf312016-10-15 16:42:09 +0200935 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100936 };
937 };
938 }
939
940 else {
941 print "Input is neither a directory nor an archive.\n\n";
942 };
943
944 $pool->wait_all_children;
945
Akron11c80302016-03-18 19:44:43 +0100946 # Delete cache file
947 unlink($cache_file) if $cache_delete;
948
Akronda3097e2017-04-23 19:53:57 +0200949 # Close tar filehandle
950 if ($to_tar && $tar_fh) {
951 $tar_archive->finish;
952 $tar_fh->close;
953 print "Wrote to tar archive.\n";
954 };
955
Akron63f20d42017-04-10 23:40:29 +0200956 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100957 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200958};
Akron941c1a62016-02-23 17:41:41 +0100959
Nils Diewald2db9ad02013-10-29 19:26:43 +0000960
# For an archive, this will create the list
# of all sigles to process.
#
# Expects the archive object as its only argument and works on the
# file-level @sigle list:
#
# - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#   for every text listed by the archive.
# - Otherwise every entry that looks like a doc sigle ("corpus/doc",
#   optionally preceded by "./") is extracted to $output right away
#   and dropped from @sigle, while all other entries are kept as
#   text sigles.
#
# Returns the prefix value: In the first case this is the first element
# of the last split_path() result (or the initial value 1, if the archive
# lists no texts), in the second case it is the result of check_prefix().
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    # (named loop variable, so method calls can't clobber the item)
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  }

  # Check sigle for doc sigles
  else {
    my @new_sigle;

    my $prefix_check = 0;

    # Iterate over all sigle
    # (a named loop variable is used instead of $_, as the loop
    # variable aliases the elements of @sigle and the archive
    # methods called below must not be able to overwrite them)
    foreach my $entry (@sigle) {

      # Sigle is a doc sigle
      my $is_doc_sigle = $entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

      print "$entry ..." if $is_doc_sigle;

      # Check if a prefix is needed - this is done only once,
      # for the first sigle in the list, no matter of which kind
      unless ($prefix_check) {

        if ($prefix = $archive->check_prefix) {
          print " with prefix ...";
        };
        $prefix_check = 1;
      };

      # Extract the whole document right away
      if ($is_doc_sigle) {
        print "\n";

        print '... ' . (
          $archive->extract_sigle([$entry], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
        );
        print "extracted.\n";
      }

      # Sigle is a text sigle
      else {
        push @new_sigle, $entry;
      };
    };

    # Only text sigles remain to be processed
    @sigle = @new_sigle;
  };

  return $prefix;
};
1034
1035
# Cleanup temporary extraction directory
# (only in case an extraction directory was set)
if ($extract_dir) {

  # remove_tree is called in safe mode, its return value is logged
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};


# Finish the output with an empty line
print "\n";
1044
Nils Diewald2db9ad02013-10-29 19:26:43 +00001045__END__
Akron941c1a62016-02-23 17:41:41 +01001046
1047=pod
1048
1049=encoding utf8
1050
1051=head1 NAME
1052
Akron42f48c12020-02-14 13:08:13 +01001053korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001054
1055
1056=head1 SYNOPSIS
1057
Akrona76d8352016-10-27 16:27:32 +02001058 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001059
Akron2fd402b2016-10-27 21:26:48 +02001060
Akron941c1a62016-02-23 17:41:41 +01001061=head1 DESCRIPTION
1062
1063L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1064compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001065The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001066
1067
1068=head1 INSTALLATION
1069
1070The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1071
Akronaf386982016-10-12 00:33:25 +02001072 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001073
Akronc13a1702016-03-15 19:33:14 +01001074In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001075be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001076Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001077Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1078Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001080
1081=head1 ARGUMENTS
1082
Akrona76d8352016-10-27 16:27:32 +02001083 $ korapxml2krill -z --input <directory> --output <filename>
1084
1085Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001086It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001087
Akron941c1a62016-02-23 17:41:41 +01001088=over 2
1089
1090=item B<archive>
1091
Akron081639e2017-04-21 19:01:39 +02001092 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001093
Akron2fd402b2016-10-27 21:26:48 +02001094Converts an archive of KorAP-XML documents. It expects a directory
1095(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001096
1097=item B<extract>
1098
Akrona76d8352016-10-27 16:27:32 +02001099 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1100
1101Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001102
Akron63f20d42017-04-10 23:40:29 +02001103=item B<serial>
1104
1105 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1106
1107Convert archives sequentially. The inputs are not merged but treated
1108as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001110are created based on the archive name. In case the C<--to-tar> flag is given,
1111the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001112
1113
Akron84b53ad2022-01-14 12:39:15 +01001114=item B<slimlog>
1115
1116 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1117
Filters out all useless aka successful information from logs, to simplify
1119log checks. Expects no further options.
1120
1121
Akron941c1a62016-02-23 17:41:41 +01001122=back
1123
1124
1125=head1 OPTIONS
1126
1127=over 2
1128
Akrona76d8352016-10-27 16:27:32 +02001129=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001130
Akrona76d8352016-10-27 16:27:32 +02001131Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001132
Akron7606afa2016-10-25 16:23:49 +02001133Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001134document, while C<archive> expects a KorAP-XML corpus folder or a zip
1135file to batch process multiple files.
1136C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001137
Akrona76d8352016-10-27 16:27:32 +02001138C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001139that the first archive listed contains all primary data files
1140and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001141
Akron7606afa2016-10-25 16:23:49 +02001142 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001143
Akron821db3d2017-04-06 21:19:31 +02001144Input may also be defined using BSD glob wildcards.
1145
1146 -i 'file/news*.zip'
1147
1148The extended input array will be sorted in length order, so the shortest
1149path needs to contain all primary data files and all meta data files.
1150
Akron0c3e3752016-06-28 15:55:53 +02001151(The directory structure follows the base directory format,
1152that may include a C<.> root folder.
1153In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001154need to be passed with a hash sign in front of the archive's name.
1155This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001156
Akron7606afa2016-10-25 16:23:49 +02001157To support zip files, a version of C<unzip> needs to be installed that is
1158compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001159
Akron7606afa2016-10-25 16:23:49 +02001160B<The root folder switch using the hash sign is experimental and
1161may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001162
Akronf73ffb62018-06-27 12:13:59 +02001163
Akron63f20d42017-04-10 23:40:29 +02001164=item B<--input-base|-ib> <directory>
1165
1166The base directory for inputs.
1167
1168
Akron941c1a62016-02-23 17:41:41 +01001169=item B<--output|-o> <directory|file>
1170
1171Output folder for archive processing or
1172document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001173writes to C<STDOUT> by default
1174(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001175
1176=item B<--overwrite|-w>
1177
1178Overwrite files that already exist.
1179
Akronf73ffb62018-06-27 12:13:59 +02001180
Akron3741f8b2016-12-21 19:55:21 +01001181=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001182
1183Define the default tokenization by specifying
1184the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001185of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001186This will directly take the file instead of running
1187the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001188
Akron3741f8b2016-12-21 19:55:21 +01001189
1190=item B<--base-sentences|-bs> <foundry>#<layer>
1191
1192Define the layer for base sentences.
1193If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001194Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1195layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001196
1197 Defaults to unset.
1198
1199
1200=item B<--base-paragraphs|-bp> <foundry>#<layer>
1201
1202Define the layer for base paragraphs.
1203If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001204Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001206
1207 Defaults to unset.
1208
1209
Akron41ac10b2017-02-08 22:47:25 +01001210=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1211
1212Define the layer for base pagebreaks.
1213Currently C<DeReKo#Structure> is the only layer supported.
1214
1215 Defaults to unset.
1216
1217
Akron941c1a62016-02-23 17:41:41 +01001218=item B<--skip|-s> <foundry>[#<layer>]
1219
Akronf7ad89e2016-03-16 18:22:47 +01001220Skip specific annotations by specifying the foundry
1221(and optionally the layer with a C<#>-prefix),
1222e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001223Can be set multiple times.
1224
Akronf73ffb62018-06-27 12:13:59 +02001225
Akronc13a1702016-03-15 19:33:14 +01001226=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001227
Akronf7ad89e2016-03-16 18:22:47 +01001228Convert specific annotations by specifying the foundry
1229(and optionally the layer with a C<#>-prefix),
1230e.g. C<Mate> or C<Mate#Morpho>.
1231Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001232
Akronf73ffb62018-06-27 12:13:59 +02001233
Akroned9baf02019-01-22 17:03:25 +01001234=item B<--non-word-tokens|-nwt>
1235
1236Tokenize non-word tokens like word tokens (defined as matching
1237C</[\d\w]/>). Useful to treat punctuations as tokens.
1238
1239 Defaults to unset.
1240
Akronf1849aa2019-12-16 23:35:33 +01001241
1242=item B<--non-verbal-tokens|-nvt>
1243
Tokenize non-verbal tokens that are marked in the primary data with
the unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1246
1247 Defaults to unset.
1248
1249
Akron941c1a62016-02-23 17:41:41 +01001250=item B<--jobs|-j>
1251
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001253for archive processing.
Akron11c80302016-03-18 19:44:43 +01001254Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001255
Unless C<sequential-extraction> is set, this will
also apply to extraction.
1258
Akronc11f7982017-02-21 21:20:14 +01001259Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001260times the number of available cores, in case L<Sys::Info>
1261is available.
Akronf7ad89e2016-03-16 18:22:47 +01001262This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001263
Akronf73ffb62018-06-27 12:13:59 +02001264
Akron263274c2019-02-07 09:48:30 +01001265=item B<--koral|-k>
1266
1267Version of the output format. Supported versions are:
1268C<0> for legacy serialization, C<0.03> for serialization
1269with metadata fields as key-values on the root object,
1270C<0.4> for serialization with metadata fields as a list
1271of C<"@type":"koral:field"> objects.
1272
1273Currently defaults to C<0.03>.
1274
1275
Akron9ec88872017-04-12 16:29:06 +02001276=item B<--sequential-extraction|-se>
1277
Flag to indicate that extraction should run sequentially,
i.e. that the C<jobs> value is not applied to extraction.
1279Some systems may have problems with extracting multiple archives
1280to the same folder at the same time.
1281Can be flagged using C<--no-sequential-extraction> as well.
1282Defaults to C<false>.
1283
Akronf73ffb62018-06-27 12:13:59 +02001284
Akron35db6e32016-03-17 22:42:22 +01001285=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001286
Akron35db6e32016-03-17 22:42:22 +01001287Define the metadata parser to use. Defaults to C<I5>.
1288Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1289This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001290
Akronf73ffb62018-06-27 12:13:59 +02001291
Akron941c1a62016-02-23 17:41:41 +01001292=item B<--gzip|-z>
1293
Akronf7ad89e2016-03-16 18:22:47 +01001294Compress the output.
1295Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001296
Akronf73ffb62018-06-27 12:13:59 +02001297
Akron11c80302016-03-18 19:44:43 +01001298=item B<--cache|-c>
1299
1300File to mmap a cache (using L<Cache::FastMmap>).
1301Defaults to C<korapxml2krill.cache> in the calling directory.
1302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron11c80302016-03-18 19:44:43 +01001304=item B<--cache-size|-cs>
1305
1306Size of the cache. Defaults to C<50m>.
1307
Akronf73ffb62018-06-27 12:13:59 +02001308
Akron11c80302016-03-18 19:44:43 +01001309=item B<--cache-init|-ci>
1310
1311Initialize cache file.
1312Can be flagged using C<--no-cache-init> as well.
1313Defaults to C<true>.
1314
Akronf73ffb62018-06-27 12:13:59 +02001315
Akron11c80302016-03-18 19:44:43 +01001316=item B<--cache-delete|-cd>
1317
1318Delete cache file after processing.
1319Can be flagged using C<--no-cache-delete> as well.
1320Defaults to C<true>.
1321
Akronf73ffb62018-06-27 12:13:59 +02001322
Akron636aa112017-04-07 18:48:56 +02001323=item B<--config|-cfg>
1324
1325Configure the parameters of your call in a file
1326of key-value pairs with whitespace separator
1327
1328 overwrite 1
1329 token DeReKo#Structure
1330 ...
1331
1332Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001333C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001334C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001335C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001336C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001337C<base-sentences>, C<base-paragraphs>,
1338C<base-pagebreaks>,
1339C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001340(semicolon separated), C<anno> (semicolon separated).
1341
Akronf73ffb62018-06-27 12:13:59 +02001342Configuration parameters will always be overwritten by
1343passed parameters.
1344
1345
Akron81500102017-04-07 20:45:44 +02001346=item B<--temporary-extract|-te>
1347
1348Only valid for the C<archive> command.
1349
1350This will first extract all files into a
1351directory and then will archive.
1352If the directory is given as C<:temp:>,
1353a temporary directory is used.
1354This is especially useful to avoid
1355massive unzipping and potential
1356network latency.
Akron636aa112017-04-07 18:48:56 +02001357
Akronf73ffb62018-06-27 12:13:59 +02001358
Akronc93a0802019-07-11 15:48:34 +02001359=item B<--to-tar>
1360
1361Only valid for the C<archive> command.
1362
1363Writes the output into a tar archive.
1364
1365
Akrone10ad322016-02-27 10:54:26 +01001366=item B<--sigle|-sg>
1367
Akron20807582016-10-26 17:11:34 +02001368Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001369Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001370I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001371Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001372In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001373On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001374
Akronf73ffb62018-06-27 12:13:59 +02001375
Akron941c1a62016-02-23 17:41:41 +01001376=item B<--log|-l>
1377
Akronb9c33812020-10-21 16:19:35 +02001378The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001379
Akronf73ffb62018-06-27 12:13:59 +02001380
Akron941c1a62016-02-23 17:41:41 +01001381=item B<--help|-h>
1382
Akron42f48c12020-02-14 13:08:13 +01001383Print help information.
Akron941c1a62016-02-23 17:41:41 +01001384
Akronf73ffb62018-06-27 12:13:59 +02001385
Akron941c1a62016-02-23 17:41:41 +01001386=item B<--version|-v>
1387
1388Print version information.
1389
1390=back
1391
Akronf73ffb62018-06-27 12:13:59 +02001392
Akronc13a1702016-03-15 19:33:14 +01001393=head1 ANNOTATION SUPPORT
1394
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1396developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1397The base foundry with paragraphs, sentences, and the text element are mandatory for
1398L<Krill|https://github.com/KorAP/Krill>.
1399
Akron821db3d2017-04-06 21:19:31 +02001400 Base
1401 #Paragraphs
1402 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001403
Akron821db3d2017-04-06 21:19:31 +02001404 Connexor
1405 #Morpho
1406 #Phrase
1407 #Sentences
1408 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001409
Akron821db3d2017-04-06 21:19:31 +02001410 CoreNLP
1411 #Constituency
1412 #Morpho
1413 #NamedEntities
1414 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001415
Akronce125b62017-06-19 11:54:36 +02001416 CMC
1417 #Morpho
1418
Akron821db3d2017-04-06 21:19:31 +02001419 DeReKo
1420 #Structure
Akronc13a1702016-03-15 19:33:14 +01001421
Akron57510c12019-01-04 14:58:53 +01001422 DGD
1423 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001424 #Structure
Akron57510c12019-01-04 14:58:53 +01001425
Akron821db3d2017-04-06 21:19:31 +02001426 DRuKoLa
1427 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001428
Akron821db3d2017-04-06 21:19:31 +02001429 Glemm
1430 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001431
Akronabb36902021-10-11 15:51:06 +02001432 Gingko
1433 #Morpho
1434
Akronea1aed52018-07-19 14:43:34 +02001435 HNC
1436 #Morpho
1437
Akron4c679192018-01-16 17:41:49 +01001438 LWC
1439 #Dependency
1440
Akron821db3d2017-04-06 21:19:31 +02001441 Malt
1442 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001443
Akron821db3d2017-04-06 21:19:31 +02001444 MarMoT
1445 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001446
Akron821db3d2017-04-06 21:19:31 +02001447 Mate
1448 #Dependency
1449 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001450
Akron821db3d2017-04-06 21:19:31 +02001451 MDParser
1452 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001453
Akron821db3d2017-04-06 21:19:31 +02001454 OpenNLP
1455 #Morpho
1456 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001457
Akron07e24772020-04-23 14:00:54 +02001458 RWK
1459 #Morpho
1460 #Structure
1461
Akron821db3d2017-04-06 21:19:31 +02001462 Sgbr
1463 #Lemma
1464 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001465
Akron7d5e6382019-08-08 16:36:27 +02001466 Talismane
1467 #Dependency
1468 #Morpho
1469
Akron821db3d2017-04-06 21:19:31 +02001470 TreeTagger
1471 #Morpho
1472 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001473
Akron821db3d2017-04-06 21:19:31 +02001474 XIP
1475 #Constituency
1476 #Morpho
1477 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001478
Akronc13a1702016-03-15 19:33:14 +01001479
1480More importers are in preparation.
1481New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1482See the built-in annotation importers as examples.
1483
Akronf73ffb62018-06-27 12:13:59 +02001484
Akron41e6c8b2021-10-14 20:22:18 +02001485=head1 METADATA SUPPORT
1486
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1488developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1489
1490=over 2
1491
1492=item I5 - Meta data for all I5 files
1493
1494=item Sgbr - Meta data from the Schreibgebrauch project
1495
1496=item Gingko - Meta data from the Gingko project in addition to I5
1497
1498=back
1499
1500More importers are in preparation.
1501New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1502See the built-in meta data importers as examples.
1503
1504
Akron8f69d632020-01-15 16:58:11 +01001505=head1 About KorAP-XML
1506
1507KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1508data model (Bański et al. 2013), where text data are stored physically
1509separated from their interpretations (i.e. annotations).
1510A text document in KorAP-XML therefore consists of several files
1511containing primary data, metadata and annotations.
1512
1513The structure of a single KorAP-XML document can be as follows:
1514
1515 - data.xml
1516 - header.xml
1517 + base
1518 - tokens.xml
1519 - ...
1520 + struct
1521 - structure.xml
1522 - ...
1523 + corenlp
1524 - morpho.xml
1525 - constituency.xml
1526 - ...
1527 + tree_tagger
1528 - morpho.xml
1529 - ...
1530 - ...
1531
1532The C<data.xml> contains the primary data, the C<header.xml> contains
1533the metadata, and the annotation layers are stored in subfolders
1534like C<base>, C<struct> or C<corenlp>
1535(so-called "foundries"; Bański et al. 2013).
1536
1537Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001538(Lüngen and Sperberg-McQueen 2012). See the documentation in
1539L<KorAP::XML::Meta::I5> for translatable fields.
1540
1541Annotations correspond to a variant of the TEI-P5 feature structures
1542(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001543Annotation feature structures refer to character sequences of the primary text
1544inside the C<text> element of the C<data.xml>.
1545A single annotation containing the lemma of a token can have the following structure:
1546
1547 <span from="0" to="3">
1548 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1549 <f name="lex">
1550 <fs>
1551 <f name="lemma">zum</f>
1552 </fs>
1553 </f>
1554 </fs>
1555 </span>
1556
The C<from> and C<to> attributes refer to the character span
1558in the primary text.
1559Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1560the structure may vary. See L<KorAP::XML::Annotation::*> for various
1561annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001562
1563Multiple KorAP-XML documents are organized on three levels following
1564the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1565corpus E<gt> document E<gt> text. On each level metadata information
1566can be stored, that C<korapxml2krill> will merge to a single metadata
1567object per text. A corpus is therefore structured as follows:
1568
1569 + <corpus>
1570 - header.xml
1571 + <document>
1572 - header.xml
1573 + <text>
1574 - data.xml
1575 - header.xml
1576 - ...
1577 - ...
1578
1579A single text can be identified by the concatenation of
1580the corpus identifier, the document identifier and the text identifier.
1581This identifier is called the text sigle
1582(e.g. a text with the identifier C<18486> in the document C<060> in the
1583corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1584
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split in multiple zip archives
1587(e.g. one zip file per foundry), which is also supported (see C<--input>).
1588
1589Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1590in form of a test suite.
1591The resulting JSON format merges all annotation layers
1592based on a single token stream.
1593
1594=head2 References
1595
1596Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1597KorAP data model: first approximation, December.
1598
1599Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1600"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1601Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1602L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1603
1604Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1605"Robust corpus architecture: a new look at virtual collections and data access",
1606Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1607L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1608
1609Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1610Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1611"Towards an international standard on featurestructure representation",
1612Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1613pp. 373-376.
1614L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1615
1616Harald Lüngen and C. M. Sperberg-McQueen (2012):
1617"A TEI P5 Document Grammar for the IDS Text Model",
1618Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1619L<PDF|https://journals.openedition.org/jtei/pdf/508>
1620
1621TEI Consortium, eds:
1622"Feature Structures",
1623Guidelines for Electronic Text Encoding and Interchange.
1624L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1625
Akron941c1a62016-02-23 17:41:41 +01001626=head1 AVAILABILITY
1627
1628 https://github.com/KorAP/KorAP-XML-Krill
1629
1630
1631=head1 COPYRIGHT AND LICENSE
1632
Akron9a2545e2022-01-16 15:15:50 +01001633Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001634
Akron6882d7d2021-02-08 09:43:57 +01001635Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001636
Akrona76d8352016-10-27 16:27:32 +02001637Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001638
Akron6882d7d2021-02-08 09:43:57 +01001639L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001640Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001641L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001642member of the
Akronf1849aa2019-12-16 23:35:33 +01001643L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001644
1645This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001646L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001647
1648=cut