blob: a4a8276f9975bcf121adc1250d339b7f90fc3de8 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akron941c1a62016-02-23 17:41:41 +0100161# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100162
# Date of the last change, shown in the version banner
our $LAST_CHANGE   = '2022/02/24';

# Directory containing this script (as determined by FindBin)
our $LOCAL         = $FindBin::Bin;

# Koral version used unless overridden by the 'koral' option
our $KORAL_VERSION = 0.03;

# Single-line version banner (newline terminated), passed to pod2usage()
our $VERSION_MSG   = 'Version ' . $KorAP::XML::Krill::VERSION
  . ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";
169
# Parse command: a leading argument that does not start
# with a dash is taken as the sub command
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Keep a copy of the remaining arguments (used to build
# the command line of child processes in serial mode)
my @keep_argv = @ARGV;

# Options that may be passed multiple times
my @skip  = ();
my @sigle = ();
my @anno  = ();
my @input = ();

# Configuration hash
my %cfg;
Akrone10ad322016-02-27 10:54:26 +0100182
# Parse options from the command line.
# Scalar options are written into %cfg, so values missing on the
# command line can be filled from a configuration file afterwards.
my $cfg_file;
my @option_spec = (

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},

  # Document basis
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{extract_dir},

  # Filters
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,

  # Environment
  'cache|c=s'                => \$cfg{cache_file},
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Removed flags are still accepted, but only trigger a warning
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Processing
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the usage sections of the POD
  'help|h'                   => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v'                => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);

GetOptions(@option_spec);
233
# Arguments passed to pod2usage() whenever the script is called
# incorrectly: show the usage sections and request exit status 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);
Akron63f20d42017-04-10 23:40:29 +0200241
# Load from configuration and fill non-given data.
# Values already set on the command line are never overwritten.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: keys in the configuration file are written
  # with dashes, keys in %cfg are written with underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init
                      koral extract-dir jobs!) {

    # Copy first, then translate: the non-destructive tr///r
    # requires Perl 5.14, while this script only asks for 5.10
    (my $underlined = $key) =~ tr/-/_/;

    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # List options (skip, sigle, anno) are given as
  # semicolon separated strings
  foreach my $list ([skip => \@skip], [sigle => \@sigle], [anno => \@anno]) {
    my ($key, $target) = @$list;

    # Values from the command line take precedence
    next if scalar(@$target) || !defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
277
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};

# The command line option is stored as 'extract_dir', while the
# 'temporary-extract' key of a configuration file is stored as
# 'temporary_extract' - respect both, the first one wins
my $extract_dir = $cfg{extract_dir} // $cfg{temporary_extract};

my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option is stored as 'cache_file', while the
# 'cache' key of a configuration file is stored as 'cache' -
# respect both, the command line wins
my $cache_file  = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

# Zero jobs means no parallel processing
my $jobs        = $cfg{jobs} // 0;

# The cache file is deleted after processing a single file by default
my $cache_delete = $cfg{cache_delete} // 1;

# The bases are compared case insensitively
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');

my $sequential_extraction = $cfg{sequential_extraction} // 0;
Akron63f20d42017-04-10 23:40:29 +0200292
# Get tokenization basis in the form 'Foundry#Layer'.
# Declaration and conditional assignment are separated,
# as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension (there is no layer, if the base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akron3741f8b2016-12-21 19:55:21 +0100298
# Convert sigle to path construct ('A_B.C' becomes 'A/B/C'),
# surrounding whitespace is removed
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Lookup table of the lowercased skip values
my %skip = map { (lc($_) => 1) } @skip;
Akron63f20d42017-04-10 23:40:29 +0200304
# Log to STDERR - the level is 'ERROR', unless another
# level was requested by the 'log' option
Log::Any::Adapter->set(
  'Stderr',
  log_level => uc(defined $cfg{log} ? $cfg{log} : 'ERROR')
);
Akron63f20d42017-04-10 23:40:29 +0200308
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is expected as the next remaining argument
  my $log_file = shift @ARGV;

  # Stop, if no log file was passed or the file does not exist
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  KorAP::XML::Log::Slim->new($log_file)->slim_to;

  exit;
};
330
331
# Unless the output is written to a tar file, the output
# of a sub command has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip and not $output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000342
# Start serial processing:
# Every input is processed by a separate 'archive' call of this
# script, writing to its own directory below the output.
if ($cmd && $cmd eq 'serial') {

  # The output is the parent of all new output paths
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments that are
  # passed on, as they are set separately for every call.
  # Both '--input value' and '--input=value' are respected.
  my $io_flag = qr/\A(?:-i|--input|-o|--output)/;
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Flag - the value follows as the next argument
    if (/$io_flag\z/) {
      $remove_next = 1;
      $keep = 0;
    }

    # Flag and value in one argument
    elsif (/$io_flag=/) {
      $remove_next = 0;
      $keep = 0;
    }

    # Value of a preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Number of inputs that could not be processed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving - continue with the next input on failure,
    # but remember the failure for the exit code
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $in failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
392
# Define supported (and preinstalled) transformation modules.
# Each entry is a list of foundry, layer and optional arguments.
my @layers = ();

# Base annotations are only added, if no other base is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all bases that are taken from the structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa, Gingko, Glemm, HNC, LWC, Malt, MarMoT,
# Mate, MDParser and OpenNLP
push @layers,
  ['DRuKoLa',  'Morpho'],
  ['Gingko',   'Morpho'],
  ['Glemm',    'Morpho'],
  ['HNC',      'Morpho'],
  ['LWC',      'Dependency'],
  ['Malt',     'Dependency'],
  ['MarMoT',   'Morpho'],
  ['Mate',     'Morpho'],
  ['Mate',     'Dependency'],
  ['MDParser', 'Dependency'],
  ['OpenNLP',  'Morpho'],
  ['OpenNLP',  'Sentences'];

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch, Talismane, TreeTagger and XIP
push @layers,
  ['Sgbr',       'Lemma'],
  ['Sgbr',       'Morpho'],
  ['Talismane',  'Dependency'],
  ['Talismane',  'Morpho'],
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],
  ['XIP',        'Morpho'],
  ['XIP',        'Constituency'],
  ['XIP',        'Sentences'],
  ['XIP',        'Dependency'];
Akron4fa37c32017-01-20 14:43:10 +0100510
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations that
# were requested explicitly in the form 'Foundry#Layer'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Drop a layer, if its foundry or its Foundry#Layer should be skipped
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
529
Akrone1dbc382016-07-08 22:24:52 +0200530
# TODO: This should not be initialized for batch
# The cache is 50m in size and freshly initialized by default
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => (defined $cfg{cache_size} ? $cfg{cache_size} : '50m'),
  init_file  => (defined $cfg{cache_init} ? $cfg{cache_init} : 1)
);

# Create batch object, that converts single documents
# based on the requested tokenization and annotations
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => (defined $cfg{koral} ? $cfg{koral} : $KORAL_VERSION),
  anno              => \@filtered_anno,
  non_word_tokens   => (defined $cfg{non_word_tokens}   ? $cfg{non_word_tokens}   : 0),
  non_verbal_tokens => (defined $cfg{non_verbal_tokens} ? $cfg{non_verbal_tokens} : 0)
);
552
Akrone512b7c2020-08-07 16:16:12 +0200553
# Auto adjust jobs:
# A value of -1 requests five jobs per core
if ($jobs eq '-1') {

  # Fall back to a single core, if the number can't be determined
  my $cores = 1;

  # Sys::Info is optional and therefore loaded at runtime
  my $has_sys_info = eval("use Sys::Info; 1;")
    && eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  unless ($has_sys_info) {
    $log->warn("Unable to determine number of cores");
  }
  else {
    $cores = Sys::Info->new->device('CPU')->count;
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
567
568
# Glob and prefix files
if (@input > 0) {

  # Prefix all inputs with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Wildcards without any match expand to nothing -
  # stop here, as all further processing requires an input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
588
589
# Process a single file
if (!$cmd) {

  # Take the start time at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start,
  # and remember the current time as the new last stop
  sub stop_time {
    my $now   = Benchmark->new;
    my $lap   = timestr(timediff($now, $main::LAST_STOP));
    my $total = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $total . ')');
    $main::LAST_STOP = $now;
  };

  # Only the first input is processed -
  # make sure the path ends with a slash
  my $input = $input[0];
  $input =~ s{(?<=[^/])$}{/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
621
Nils Diewald59094f22014-11-05 18:20:50 +0000622
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  $output or pod2usage(%ERROR_HASH);

  # Create new archive object - the first input is the primary archive
  my $archive = -f($input[0]) && KorAP::XML::Archive->new($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  for my $idx (1 .. $#input) {
    $archive->attach($input[$idx]);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach (@sigle) {

    print "$_ ...\n";

    # TODO: Make this OS independent
    # TODO:
    #   - prefix???
    my $extracted = $archive->extract_sigle([$_], $output, $jobs);

    # Report on the success of the extraction
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
667
Akron81500102017-04-07 20:45:44 +0200668
Akron941c1a62016-02-23 17:41:41 +0100669# Process an archive
670elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000671
Akron81500102017-04-07 20:45:44 +0200672 my $archive_output;
673
674 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100675 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200676
677 # Create new archive object
678 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
679
680 # Check zip capabilities
681 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200682 $log->error("Unzip is not installed or incompatible.");
683 exit 1;
Akron81500102017-04-07 20:45:44 +0200684 };
685
686 # Add further annotation archived
687 $archive->attach($_) foreach @input[1..$#input];
688
689 # Create a temporary directory
690 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200691 $extract_dir = tempdir(CLEANUP => 0);
692 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200693 };
694
Akron63f20d42017-04-10 23:40:29 +0200695 # Add some random extra to avoid clashes with multiple archives
696 $extract_dir = catdir($extract_dir, random_string('cccccc'));
697
Akron31a08cb2019-02-20 20:43:26 +0100698 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200699 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200700 @input = ($extract_dir);
701 }
702 else {
703 $log->error('Unable to extract from primary archive ' . $input[0] .
704 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200705 exit 1;
Akron81500102017-04-07 20:45:44 +0200706 };
707 }
708
709 # Can't create archive object
710 else {
711 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200712 exit 1;
Akron81500102017-04-07 20:45:44 +0200713 };
714 };
715
Akron7d4cdd82016-08-17 21:39:45 +0200716 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100717 my $pool = Parallel::ForkManager->new($jobs);
718
Akron7d4cdd82016-08-17 21:39:45 +0200719 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100720 my $iter = 1; # Current text in process
721
Akronda3097e2017-04-23 19:53:57 +0200722 my $tar_archive;
723 my $output_dir = $output;
724 my $tar_fh;
725
726 # Initialize tar archive
727 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200728
729 # Set output name
730 my $tar_file = $output;
731 unless ($tar_file =~ /\.tar$/) {
732 $tar_file .= '.tar';
733 };
734
735 # Initiate the tar file
736 print "Writing to file $tar_file\n";
737 $tar_fh = IO::File->new($tar_file, 'w');
738 $tar_fh->binmode(1);
739
Akroneb370a02022-02-24 13:33:40 +0100740 # Use tar builder for archiving
741 if (eval("use Archive::Tar::Builder; 1;")) {
742 $tar_archive = Archive::Tar::Builder->new(
743 ignore_errors => 1
744 );
745
746 # Set handle
747 $tar_archive->set_handle($tar_fh);
748 }
749
750 # Fallback solution
751 else {
752 $tar_archive = KorAP::XML::TarBuilder->new(
753 $tar_fh
754 );
755 };
Akronda3097e2017-04-23 19:53:57 +0200756
757 # Output to temporary directory
758 $output_dir = File::Temp->newdir;
759 };
760
Akron941c1a62016-02-23 17:41:41 +0100761 # Report on fork message
762 $pool->run_on_finish (
763 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200764 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100765 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200766
Akron08385f62016-03-22 20:37:04 +0100767 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200768 ($iter++) . "/$count]" .
769 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200770 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200771
772 if (!$code && $to_tar && $data->[2]) {
773 my $filename = $data->[2];
774
775 # Lock filehandle
776 if (flock($tar_fh, LOCK_EX)) {
777
Akron9a062ce2017-07-04 19:12:05 +0200778 my $clean_file = fileparse($filename);
779
Akronda3097e2017-04-23 19:53:57 +0200780 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200781 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200782 unlink $filename;
783
784 # Unlock filehandle
785 flock($tar_fh, LOCK_UN);
786 }
787 else {
788 $log->warn("Unable to add $filename to archive");
789 };
790 };
791
Akron4c0cf312016-10-15 16:42:09 +0200792 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100793 }
794 );
795
796 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200797 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100798 print "Reading data ...\n";
799
Akron7d4cdd82016-08-17 21:39:45 +0200800 # unless (Cache::FastMmap->new(
801 # share_file => $cache_file,
802 # cache_size => $cache_size,
803 # init_file => $cache_init
804 # )) {
805 # print "Unable to intialize cache '$cache_file'\n\n";
806 # exit(1);
807 # };
Akron11c80302016-03-18 19:44:43 +0100808
Akron486f9ab2017-04-22 23:25:19 +0200809
Akron941c1a62016-02-23 17:41:41 +0100810 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100811 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200812 # TODO:
813 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100814 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100815 my @dirs;
816 my $dir;
817
Akron7d4cdd82016-08-17 21:39:45 +0200818 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100819 while (1) {
820 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200821 push @dirs, $dir;
822 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100823 };
824 last unless $it->next;
825 };
826
827 print "Start processing ...\n";
828 $t = Benchmark->new;
829 $count = scalar @dirs;
830
831 DIRECTORY_LOOP:
832 for (my $i = 0; $i < $count; $i++) {
833
Akrone1dbc382016-07-08 22:24:52 +0200834 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200835 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200836 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200837 );
Akron941c1a62016-02-23 17:41:41 +0100838
839 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200840 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200841
Akron13d56622016-10-31 14:54:49 +0100842 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200843 $pool->finish(
844 0,
Akronda3097e2017-04-23 19:53:57 +0200845 [
846 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
847 undef,
848 $filename
849 ]
Akron486f9ab2017-04-22 23:25:19 +0200850 );
Akron3ec48972016-08-17 23:24:52 +0200851 }
852 else {
Akron4c0cf312016-10-15 16:42:09 +0200853 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200854 };
Akron941c1a62016-02-23 17:41:41 +0100855 };
856 }
857
858 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200859 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200860
Akron941c1a62016-02-23 17:41:41 +0100861 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200862 $log->error("Unzip is not installed or incompatible.");
863 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100864 };
865
Akron08385f62016-03-22 20:37:04 +0100866 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200867 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100868
Akron31a08cb2019-02-20 20:43:26 +0100869 # Get sigles to extract
870 my $prefix = set_sigle($archive);
871
Akron941c1a62016-02-23 17:41:41 +0100872 print "Start processing ...\n";
873 $t = Benchmark->new;
874 my @dirs = $archive->list_texts;
875 $count = scalar @dirs;
876
877 ARCHIVE_LOOP:
878 for (my $i = 0; $i < $count; $i++) {
879
880 # Split path information
881 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
882
Akrone1dbc382016-07-08 22:24:52 +0200883 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200884 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200885 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200886 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200887 catfile($corpus, $doc, $text)
888 . '.json' . ($gzip ? '.gz' : '')
889 )
Akrone1dbc382016-07-08 22:24:52 +0200890 );
Akron941c1a62016-02-23 17:41:41 +0100891
892 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200893 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100894
Akron4c0cf312016-10-15 16:42:09 +0200895 # Create temporary file
896 $temp = File::Temp->newdir;
897
Akronbdf434a2016-10-24 17:42:07 +0200898 # TODO: Check if $filename exist at the beginning,
899 # because extraction can be horrible slow!
900
Akron941c1a62016-02-23 17:41:41 +0100901 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100902 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100903
Akron7d4cdd82016-08-17 21:39:45 +0200904 # Create corpus directory
905 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100906
Akron7d4cdd82016-08-17 21:39:45 +0200907 # Temporary directory
908 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100909
Akron7d4cdd82016-08-17 21:39:45 +0200910 # Write file
Akron13d56622016-10-31 14:54:49 +0100911 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200912
Akron4c0cf312016-10-15 16:42:09 +0200913 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100914 $pool->finish(
915 0,
Akronda3097e2017-04-23 19:53:57 +0200916 [
917 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
918 $temp,
919 $filename
920 ]
Akron13d56622016-10-31 14:54:49 +0100921 );
Akron7d4cdd82016-08-17 21:39:45 +0200922 }
923 else {
Akron4c0cf312016-10-15 16:42:09 +0200924 # Delete temporary file
925 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200926 };
Akron941c1a62016-02-23 17:41:41 +0100927 }
Akron7d4cdd82016-08-17 21:39:45 +0200928
929 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100930 else {
Akron4c0cf312016-10-15 16:42:09 +0200931 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100932 };
933 };
934 }
935
936 else {
937 print "Input is neither a directory nor an archive.\n\n";
938 };
939
940 $pool->wait_all_children;
941
Akron11c80302016-03-18 19:44:43 +0100942 # Delete cache file
943 unlink($cache_file) if $cache_delete;
944
Akronda3097e2017-04-23 19:53:57 +0200945 # Close tar filehandle
946 if ($to_tar && $tar_fh) {
947 $tar_archive->finish;
948 $tar_fh->close;
949 print "Wrote to tar archive.\n";
950 };
951
Akron63f20d42017-04-10 23:40:29 +0200952 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100953 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200954};
Akron941c1a62016-02-23 17:41:41 +0100955
Nils Diewald2db9ad02013-10-29 19:26:43 +0000956
# Determine the sigles to process for an archive.
#
# Reads and rewrites the file-level list @sigle:
#   - If @sigle is empty, it is filled with the text sigle
#     (corpus/doc/text, joined by '/') of every text listed
#     by the archive.
#   - Otherwise every entry matching the two-segment doc sigle
#     pattern is extracted immediately to $output and dropped
#     from @sigle, while all other entries are kept as text sigles.
#
# Parameter:
#   $archive - archive object; list_texts, split_path, check_prefix
#              and extract_sigle are called on it.
#
# Returns the prefix indicator: the first value returned by
# split_path for the last listed text (no sigles given), the result
# of check_prefix (sigles given), or 1 if the archive lists no texts.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {

    # A named loop variable is used on purpose, so that the archive
    # methods cannot clobber the current path via $_
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles were given - the prefix check runs only once,
  # triggered by the first sigle that is inspected
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Text sigles that remain to be processed by the caller
  my @text_sigle;

  # Iterate over all sigles (lexical loop variable, as the elements
  # of @sigle would otherwise be aliased by $_ during method calls)
  foreach my $entry (@sigle) {

    # Sigle is a doc sigle
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$entry ...";

      # Check if a prefix is needed
      $check_prefix_once->();

      print "\n";

      # Extract the whole document right away
      my $extracted = $archive->extract_sigle(
        [$entry],
        $output,
        $sequential_extraction ? 1 : $jobs
      );

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $entry;

      # Check if a prefix is needed
      $check_prefix_once->();
    };
  };

  # Only text sigles are left for further processing
  @sigle = @text_sigle;

  return $prefix;
};
1030
1031
# Remove the extraction directory in case $extract_dir is set
# and log the number reported by remove_tree
if ($extract_dir) {
  my $removed_count = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed_count . ' objects'
  );
};


# Finish the output with a newline
print "\n";
1040
Nils Diewald2db9ad02013-10-29 19:26:43 +00001041__END__
Akron941c1a62016-02-23 17:41:41 +01001042
1043=pod
1044
1045=encoding utf8
1046
1047=head1 NAME
1048
Akron42f48c12020-02-14 13:08:13 +01001049korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001050
1051
1052=head1 SYNOPSIS
1053
Akrona76d8352016-10-27 16:27:32 +02001054 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001055
Akron2fd402b2016-10-27 21:26:48 +02001056
Akron941c1a62016-02-23 17:41:41 +01001057=head1 DESCRIPTION
1058
1059L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1060compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001061The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001062
1063
1064=head1 INSTALLATION
1065
1066The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1067
Akronaf386982016-10-12 00:33:25 +02001068 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001069
Akronc13a1702016-03-15 19:33:14 +01001070In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001071be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001072Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001073Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1074Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001076
1077=head1 ARGUMENTS
1078
Akrona76d8352016-10-27 16:27:32 +02001079 $ korapxml2krill -z --input <directory> --output <filename>
1080
1081Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001082It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001083
Akron941c1a62016-02-23 17:41:41 +01001084=over 2
1085
1086=item B<archive>
1087
Akron081639e2017-04-21 19:01:39 +02001088 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001089
Akron2fd402b2016-10-27 21:26:48 +02001090Converts an archive of KorAP-XML documents. It expects a directory
1091(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001092
1093=item B<extract>
1094
Akrona76d8352016-10-27 16:27:32 +02001095 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1096
1097Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001098
Akron63f20d42017-04-10 23:40:29 +02001099=item B<serial>
1100
1101 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1102
1103Convert archives sequentially. The inputs are not merged but treated
1104as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001106are created based on the archive name. In case the C<--to-tar> flag is given,
1107the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001108
1109
Akron84b53ad2022-01-14 12:39:15 +01001110=item B<slimlog>
1111
1112 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1113
Filters out all useless aka successful information from logs, to simplify
1115log checks. Expects no further options.
1116
1117
Akron941c1a62016-02-23 17:41:41 +01001118=back
1119
1120
1121=head1 OPTIONS
1122
1123=over 2
1124
Akrona76d8352016-10-27 16:27:32 +02001125=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001126
Akrona76d8352016-10-27 16:27:32 +02001127Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001128
Akron7606afa2016-10-25 16:23:49 +02001129Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001130document, while C<archive> expects a KorAP-XML corpus folder or a zip
1131file to batch process multiple files.
1132C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001133
Akrona76d8352016-10-27 16:27:32 +02001134C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001135that the first archive listed contains all primary data files
1136and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001137
Akron7606afa2016-10-25 16:23:49 +02001138 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001139
Akron821db3d2017-04-06 21:19:31 +02001140Input may also be defined using BSD glob wildcards.
1141
1142 -i 'file/news*.zip'
1143
1144The extended input array will be sorted in length order, so the shortest
1145path needs to contain all primary data files and all meta data files.
1146
Akron0c3e3752016-06-28 15:55:53 +02001147(The directory structure follows the base directory format,
1148that may include a C<.> root folder.
1149In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001150need to be passed with a hash sign in front of the archive's name.
1151This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001152
Akron7606afa2016-10-25 16:23:49 +02001153To support zip files, a version of C<unzip> needs to be installed that is
1154compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001155
Akron7606afa2016-10-25 16:23:49 +02001156B<The root folder switch using the hash sign is experimental and
1157may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001158
Akronf73ffb62018-06-27 12:13:59 +02001159
Akron63f20d42017-04-10 23:40:29 +02001160=item B<--input-base|-ib> <directory>
1161
1162The base directory for inputs.
1163
1164
Akron941c1a62016-02-23 17:41:41 +01001165=item B<--output|-o> <directory|file>
1166
1167Output folder for archive processing or
1168document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001169writes to C<STDOUT> by default
1170(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001171
1172=item B<--overwrite|-w>
1173
1174Overwrite files that already exist.
1175
Akronf73ffb62018-06-27 12:13:59 +02001176
Akron3741f8b2016-12-21 19:55:21 +01001177=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001178
1179Define the default tokenization by specifying
1180the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001181of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001182This will directly take the file instead of running
1183the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001184
Akron3741f8b2016-12-21 19:55:21 +01001185
1186=item B<--base-sentences|-bs> <foundry>#<layer>
1187
1188Define the layer for base sentences.
1189If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001190Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1191layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001192
1193 Defaults to unset.
1194
1195
1196=item B<--base-paragraphs|-bp> <foundry>#<layer>
1197
1198Define the layer for base paragraphs.
1199If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001200Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001202
1203 Defaults to unset.
1204
1205
Akron41ac10b2017-02-08 22:47:25 +01001206=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1207
1208Define the layer for base pagebreaks.
1209Currently C<DeReKo#Structure> is the only layer supported.
1210
1211 Defaults to unset.
1212
1213
Akron941c1a62016-02-23 17:41:41 +01001214=item B<--skip|-s> <foundry>[#<layer>]
1215
Akronf7ad89e2016-03-16 18:22:47 +01001216Skip specific annotations by specifying the foundry
1217(and optionally the layer with a C<#>-prefix),
1218e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001219Can be set multiple times.
1220
Akronf73ffb62018-06-27 12:13:59 +02001221
Akronc13a1702016-03-15 19:33:14 +01001222=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001223
Akronf7ad89e2016-03-16 18:22:47 +01001224Convert specific annotations by specifying the foundry
1225(and optionally the layer with a C<#>-prefix),
1226e.g. C<Mate> or C<Mate#Morpho>.
1227Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001228
Akronf73ffb62018-06-27 12:13:59 +02001229
Akroned9baf02019-01-22 17:03:25 +01001230=item B<--non-word-tokens|-nwt>
1231
1232Tokenize non-word tokens like word tokens (defined as matching
1233C</[\d\w]/>). Useful to treat punctuations as tokens.
1234
1235 Defaults to unset.
1236
Akronf1849aa2019-12-16 23:35:33 +01001237
1238=item B<--non-verbal-tokens|-nvt>
1239
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1242
1243 Defaults to unset.
1244
1245
Akron941c1a62016-02-23 17:41:41 +01001246=item B<--jobs|-j>
1247
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001249for archive processing.
Akron11c80302016-03-18 19:44:43 +01001250Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001251
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1254
Akronc11f7982017-02-21 21:20:14 +01001255Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001256times the number of available cores, in case L<Sys::Info>
1257is available.
Akronf7ad89e2016-03-16 18:22:47 +01001258This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001259
Akronf73ffb62018-06-27 12:13:59 +02001260
Akron263274c2019-02-07 09:48:30 +01001261=item B<--koral|-k>
1262
1263Version of the output format. Supported versions are:
1264C<0> for legacy serialization, C<0.03> for serialization
1265with metadata fields as key-values on the root object,
1266C<0.4> for serialization with metadata fields as a list
1267of C<"@type":"koral:field"> objects.
1268
1269Currently defaults to C<0.03>.
1270
1271
Akron9ec88872017-04-12 16:29:06 +02001272=item B<--sequential-extraction|-se>
1273
Flag to indicate that extraction is done sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1275Some systems may have problems with extracting multiple archives
1276to the same folder at the same time.
1277Can be flagged using C<--no-sequential-extraction> as well.
1278Defaults to C<false>.
1279
Akronf73ffb62018-06-27 12:13:59 +02001280
Akron35db6e32016-03-17 22:42:22 +01001281=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001282
Akron35db6e32016-03-17 22:42:22 +01001283Define the metadata parser to use. Defaults to C<I5>.
1284Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1285This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001286
Akronf73ffb62018-06-27 12:13:59 +02001287
Akron941c1a62016-02-23 17:41:41 +01001288=item B<--gzip|-z>
1289
Akronf7ad89e2016-03-16 18:22:47 +01001290Compress the output.
1291Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001292
Akronf73ffb62018-06-27 12:13:59 +02001293
Akron11c80302016-03-18 19:44:43 +01001294=item B<--cache|-c>
1295
1296File to mmap a cache (using L<Cache::FastMmap>).
1297Defaults to C<korapxml2krill.cache> in the calling directory.
1298
Akronf73ffb62018-06-27 12:13:59 +02001299
Akron11c80302016-03-18 19:44:43 +01001300=item B<--cache-size|-cs>
1301
1302Size of the cache. Defaults to C<50m>.
1303
Akronf73ffb62018-06-27 12:13:59 +02001304
Akron11c80302016-03-18 19:44:43 +01001305=item B<--cache-init|-ci>
1306
1307Initialize cache file.
1308Can be flagged using C<--no-cache-init> as well.
1309Defaults to C<true>.
1310
Akronf73ffb62018-06-27 12:13:59 +02001311
Akron11c80302016-03-18 19:44:43 +01001312=item B<--cache-delete|-cd>
1313
1314Delete cache file after processing.
1315Can be flagged using C<--no-cache-delete> as well.
1316Defaults to C<true>.
1317
Akronf73ffb62018-06-27 12:13:59 +02001318
Akron636aa112017-04-07 18:48:56 +02001319=item B<--config|-cfg>
1320
1321Configure the parameters of your call in a file
of key-value pairs with whitespace separator:
1323
1324 overwrite 1
1325 token DeReKo#Structure
1326 ...
1327
1328Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001329C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001330C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001331C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001332C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001333C<base-sentences>, C<base-paragraphs>,
1334C<base-pagebreaks>,
1335C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001336(semicolon separated), C<anno> (semicolon separated).
1337
Akronf73ffb62018-06-27 12:13:59 +02001338Configuration parameters will always be overwritten by
1339passed parameters.
1340
1341
Akron81500102017-04-07 20:45:44 +02001342=item B<--temporary-extract|-te>
1343
1344Only valid for the C<archive> command.
1345
1346This will first extract all files into a
1347directory and then will archive.
1348If the directory is given as C<:temp:>,
1349a temporary directory is used.
1350This is especially useful to avoid
1351massive unzipping and potential
1352network latency.
Akron636aa112017-04-07 18:48:56 +02001353
Akronf73ffb62018-06-27 12:13:59 +02001354
Akronc93a0802019-07-11 15:48:34 +02001355=item B<--to-tar>
1356
1357Only valid for the C<archive> command.
1358
1359Writes the output into a tar archive.
1360
1361
Akrone10ad322016-02-27 10:54:26 +01001362=item B<--sigle|-sg>
1363
Akron20807582016-10-26 17:11:34 +02001364Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001365Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001366I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001367Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001368In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001369On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001370
Akronf73ffb62018-06-27 12:13:59 +02001371
Akron941c1a62016-02-23 17:41:41 +01001372=item B<--log|-l>
1373
Akronb9c33812020-10-21 16:19:35 +02001374The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001375
Akronf73ffb62018-06-27 12:13:59 +02001376
Akron941c1a62016-02-23 17:41:41 +01001377=item B<--help|-h>
1378
Akron42f48c12020-02-14 13:08:13 +01001379Print help information.
Akron941c1a62016-02-23 17:41:41 +01001380
Akronf73ffb62018-06-27 12:13:59 +02001381
Akron941c1a62016-02-23 17:41:41 +01001382=item B<--version|-v>
1383
1384Print version information.
1385
1386=back
1387
Akronf73ffb62018-06-27 12:13:59 +02001388
Akronc13a1702016-03-15 19:33:14 +01001389=head1 ANNOTATION SUPPORT
1390
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1392developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1393The base foundry with paragraphs, sentences, and the text element are mandatory for
1394L<Krill|https://github.com/KorAP/Krill>.
1395
Akron821db3d2017-04-06 21:19:31 +02001396 Base
1397 #Paragraphs
1398 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001399
Akron821db3d2017-04-06 21:19:31 +02001400 Connexor
1401 #Morpho
1402 #Phrase
1403 #Sentences
1404 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001405
Akron821db3d2017-04-06 21:19:31 +02001406 CoreNLP
1407 #Constituency
1408 #Morpho
1409 #NamedEntities
1410 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001411
Akronce125b62017-06-19 11:54:36 +02001412 CMC
1413 #Morpho
1414
Akron821db3d2017-04-06 21:19:31 +02001415 DeReKo
1416 #Structure
Akronc13a1702016-03-15 19:33:14 +01001417
Akron57510c12019-01-04 14:58:53 +01001418 DGD
1419 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001420 #Structure
Akron57510c12019-01-04 14:58:53 +01001421
Akron821db3d2017-04-06 21:19:31 +02001422 DRuKoLa
1423 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001424
Akron821db3d2017-04-06 21:19:31 +02001425 Glemm
1426 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001427
Akronabb36902021-10-11 15:51:06 +02001428 Gingko
1429 #Morpho
1430
Akronea1aed52018-07-19 14:43:34 +02001431 HNC
1432 #Morpho
1433
Akron4c679192018-01-16 17:41:49 +01001434 LWC
1435 #Dependency
1436
Akron821db3d2017-04-06 21:19:31 +02001437 Malt
1438 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001439
Akron821db3d2017-04-06 21:19:31 +02001440 MarMoT
1441 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001442
Akron821db3d2017-04-06 21:19:31 +02001443 Mate
1444 #Dependency
1445 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001446
Akron821db3d2017-04-06 21:19:31 +02001447 MDParser
1448 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001449
Akron821db3d2017-04-06 21:19:31 +02001450 OpenNLP
1451 #Morpho
1452 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001453
Akron07e24772020-04-23 14:00:54 +02001454 RWK
1455 #Morpho
1456 #Structure
1457
Akron821db3d2017-04-06 21:19:31 +02001458 Sgbr
1459 #Lemma
1460 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001461
Akron7d5e6382019-08-08 16:36:27 +02001462 Talismane
1463 #Dependency
1464 #Morpho
1465
Akron821db3d2017-04-06 21:19:31 +02001466 TreeTagger
1467 #Morpho
1468 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001469
Akron821db3d2017-04-06 21:19:31 +02001470 XIP
1471 #Constituency
1472 #Morpho
1473 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001474
Akronc13a1702016-03-15 19:33:14 +01001475
1476More importers are in preparation.
1477New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1478See the built-in annotation importers as examples.
1479
Akronf73ffb62018-06-27 12:13:59 +02001480
Akron41e6c8b2021-10-14 20:22:18 +02001481=head1 METADATA SUPPORT
1482
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1484developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1485
1486=over 2
1487
1488=item I5 - Meta data for all I5 files
1489
1490=item Sgbr - Meta data from the Schreibgebrauch project
1491
1492=item Gingko - Meta data from the Gingko project in addition to I5
1493
1494=back
1495
1496More importers are in preparation.
1497New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1498See the built-in meta data importers as examples.
1499
1500
Akron8f69d632020-01-15 16:58:11 +01001501=head1 About KorAP-XML
1502
1503KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1504data model (Bański et al. 2013), where text data are stored physically
1505separated from their interpretations (i.e. annotations).
1506A text document in KorAP-XML therefore consists of several files
1507containing primary data, metadata and annotations.
1508
1509The structure of a single KorAP-XML document can be as follows:
1510
1511 - data.xml
1512 - header.xml
1513 + base
1514 - tokens.xml
1515 - ...
1516 + struct
1517 - structure.xml
1518 - ...
1519 + corenlp
1520 - morpho.xml
1521 - constituency.xml
1522 - ...
1523 + tree_tagger
1524 - morpho.xml
1525 - ...
1526 - ...
1527
1528The C<data.xml> contains the primary data, the C<header.xml> contains
1529the metadata, and the annotation layers are stored in subfolders
1530like C<base>, C<struct> or C<corenlp>
1531(so-called "foundries"; Bański et al. 2013).
1532
1533Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001534(Lüngen and Sperberg-McQueen 2012). See the documentation in
1535L<KorAP::XML::Meta::I5> for translatable fields.
1536
1537Annotations correspond to a variant of the TEI-P5 feature structures
1538(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001539Annotation feature structures refer to character sequences of the primary text
1540inside the C<text> element of the C<data.xml>.
1541A single annotation containing the lemma of a token can have the following structure:
1542
1543 <span from="0" to="3">
1544 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1545 <f name="lex">
1546 <fs>
1547 <f name="lemma">zum</f>
1548 </fs>
1549 </f>
1550 </fs>
1551 </span>
1552
The C<from> and C<to> attributes refer to the character span
1554in the primary text.
1555Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1556the structure may vary. See L<KorAP::XML::Annotation::*> for various
1557annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001558
1559Multiple KorAP-XML documents are organized on three levels following
1560the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1561corpus E<gt> document E<gt> text. On each level metadata information
1562can be stored, that C<korapxml2krill> will merge to a single metadata
1563object per text. A corpus is therefore structured as follows:
1564
1565 + <corpus>
1566 - header.xml
1567 + <document>
1568 - header.xml
1569 + <text>
1570 - data.xml
1571 - header.xml
1572 - ...
1573 - ...
1574
1575A single text can be identified by the concatenation of
1576the corpus identifier, the document identifier and the text identifier.
1577This identifier is called the text sigle
1578(e.g. a text with the identifier C<18486> in the document C<060> in the
1579corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1580
These corpora are often stored in zip files, which C<korapxml2krill>
1582can deal with. Corpora may also be split in multiple zip archives
1583(e.g. one zip file per foundry), which is also supported (see C<--input>).
1584
1585Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1586in form of a test suite.
1587The resulting JSON format merges all annotation layers
1588based on a single token stream.
1589
1590=head2 References
1591
1592Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1593KorAP data model: first approximation, December.
1594
1595Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1596"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1597Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1598L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1599
1600Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1601"Robust corpus architecture: a new look at virtual collections and data access",
1602Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1603L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1604
1605Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1606Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1608Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1609pp. 373-376.
1610L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1611
1612Harald Lüngen and C. M. Sperberg-McQueen (2012):
1613"A TEI P5 Document Grammar for the IDS Text Model",
1614Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1615L<PDF|https://journals.openedition.org/jtei/pdf/508>
1616
1617TEI Consortium, eds:
1618"Feature Structures",
1619Guidelines for Electronic Text Encoding and Interchange.
1620L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1621
Akron941c1a62016-02-23 17:41:41 +01001622=head1 AVAILABILITY
1623
1624 https://github.com/KorAP/KorAP-XML-Krill
1625
1626
1627=head1 COPYRIGHT AND LICENSE
1628
Akron9a2545e2022-01-16 15:15:50 +01001629Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001630
Akron6882d7d2021-02-08 09:43:57 +01001631Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001632
Akrona76d8352016-10-27 16:27:32 +02001633Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001634
Akron6882d7d2021-02-08 09:43:57 +01001635L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001636Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001637L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001638member of the
Akronf1849aa2019-12-16 23:35:33 +01001639L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001640
1641This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001642L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001643
1644=cut