blob: a1dac4d938df044a3c29bb810a5d6f8546329efd [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010022use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020023use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020024use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020025use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020026use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020027use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron941c1a62016-02-23 17:41:41 +0100160# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100161
Akron9a2545e2022-01-16 15:15:50 +0100162our $LAST_CHANGE = '2022/01/17';
Akron941c1a62016-02-23 17:41:41 +0100163our $LOCAL = $FindBin::Bin;
Akron263274c2019-02-07 09:48:30 +0100164our $KORAL_VERSION = 0.03;
Akron941c1a62016-02-23 17:41:41 +0100165our $VERSION_MSG = <<"VERSION";
166Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
167VERSION
168
# Parse command
# The first argument is treated as the command name,
# unless it is missing (or false) or starts with a dash.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Remember the remaining arguments, so they can be
# handed over to child processes later
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100176
# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Path to an optional configuration file
my $cfg_file;

# Parse options from the command line
GetOptions(

  # List options
  'input|i=s'  => \@input,
  'skip|s=s'   => \@skip,
  'sigle|sg=s' => \@sigle,
  'anno|a=s'   => \@anno,

  # Configuration file
  'config|cfg=s' => \$cfg_file,

  # Scalar options, each stored in its slot of the configuration hash.
  # Taking the reference creates the slot, even if the option is not given.
  (map { ( $_->[0] => \$cfg{ $_->[1] } ) } (
    ['input-base|ib=s'          => 'input_base'],
    ['output|o=s'               => 'output'],
    ['overwrite|w'              => 'overwrite'],
    ['meta|m=s'                 => 'meta'],
    ['token|t=s'                => 'token'],
    ['base-sentences|bs=s'      => 'base_sentences'],
    ['base-paragraphs|bp=s'     => 'base_paragraphs'],
    ['base-pagebreaks|bpb=s'    => 'base_pagebreaks'],
    ['gzip|z'                   => 'gzip'],
    ['temporary-extract|te=s'   => 'extract_dir'],
    ['cache|c=s'                => 'cache_file'],
    ['log|l=s'                  => 'log'],
    ['jobs|j=i'                 => 'jobs'],
    ['koral|k=f'                => 'koral'],
    ['to-tar'                   => 'to_tar'],
    ['non-word-tokens|nwt'      => 'non_word_tokens'],
    ['non-verbal-tokens|nvt'    => 'non_verbal_tokens'],
    ['sequential-extraction|se' => 'sequential_extraction'],
    ['cache-size|cs=s'          => 'cache_size'],
    ['cache-delete|cd!'         => 'cache_delete'],
    ['cache-init|ci!'           => 'cache_init']
  )),

  # Flags that are still accepted, but only trigger a warning
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Print the usage sections
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
232
# Parameters passed to pod2usage() whenever the script is
# invoked incorrectly: show the usage sections together with
# the version message and leave with exit code 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);
Akron63f20d42017-04-10 23:40:29 +0200240
# Load from configuration and fill non-given data.
# Values given on the command line always take precedence,
# the configuration file only fills slots that are still undefined.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Configuration keys whose slot in %cfg is not the plain
  # dash-to-underscore translation of the key.
  # The temporary extraction directory is stored as 'extract_dir'
  # (as done for the command line option), so 'temporary-extract'
  # has to be mapped to that slot to have any effect.
  my %slot_of = (
    'temporary-extract' => 'extract_dir'
  );

  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init
                      koral extract-dir jobs!) {
    my $slot = $slot_of{$key} // ($key =~ tr/-/_/r);
    if (!defined($cfg{$slot}) && defined $config{$key}) {
      $cfg{$slot} = $config{$key};
    };
  };

  # List values are given as semicolon separated strings
  # and are only used, if no value was given on the command line

  # Skip
  if (!scalar(@skip) && defined $config{'skip'}) {
    @skip = split /\s*;\s*/, $config{'skip'};
  };

  # Sigle
  if (!scalar(@sigle) && defined $config{'sigle'}) {
    @sigle = split /\s*;\s*/, $config{'sigle'};
  };

  # Anno
  if (!scalar(@anno) && defined $config{'anno'}) {
    @anno = split /\s*;\s*/, $config{'anno'};
  };
};
276
# Init variables and set default values
my $output          = $cfg{output};
my $input_base      = $cfg{input_base};
my $gzip            = $cfg{gzip};
my $to_tar          = $cfg{to_tar};
my $extract_dir     = $cfg{extract_dir};
my $token_base      = $cfg{token} // 'OpenNLP#tokens';

# The command line option stores the cache file in 'cache_file',
# while the configuration file stores it in 'cache' - respect both,
# with the command line taking precedence
my $cache_file      = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';
my $jobs            = $cfg{jobs} // 0;
my $cache_delete    = $cfg{cache_delete} // 1;
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# Both parts are undefined for an empty basis and the layer
# is undefined, if no '#' is given.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of the (lowercased) foundries and layers to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200303
# Log to STDERR - only errors are reported by default
Log::Any::Adapter->set('Stderr', log_level => uc($cfg{log} // 'ERROR'));

# Commands writing to a directory require the directory to exist
# (this does not apply, if the to-tar option is set)
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000318
Akronc11f7982017-02-21 21:20:14 +0100319
# Start serial processing:
# Each input is converted by a separate 'archive' call of this script,
# writing to a subdirectory of the output named after the input.
if ($cmd && $cmd eq 'serial') {

  # Remove all inputs and outputs from the original arguments,
  # as each child call gets its own input and output
  my @child_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag, followed by its value
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    }

    # Input or output flag with the value attached by '='
    elsif ($arg =~ /^(?:-i|--input|-o|--output)=/) {
      $remove_next = 0;
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @child_argv, $arg;
  };
  @keep_argv = @child_argv;

  # Number of inputs that could not be processed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - the list form bypasses the shell.
    # A non-zero wait status means the child could not be
    # started or did not finish successfully.
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed++;
    };
  };

  # Report failing children to the caller
  exit($failed ? 1 : 0);
};
369
# Define supported (and preinstalled) transformation modules

# Structural units that are based on the DeReKo structure layer
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# Each layer is given as [Foundry, Layer], optionally followed
# by a further parameter. The order of the list is kept as is.
my @layers = (

  # Base - only if no other basis is given
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - with the base units appended, if there are any
  [
    'DeReKo', 'Structure',
    (@dereko_attr ? ('base-' . join('-', @dereko_attr)) : ())
  ],

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence'] : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akrone1dbc382016-07-08 22:24:52 +0200486
Akron4fa37c32017-01-20 14:43:10 +0100487
# Check filters
my @filtered_anno;

# Everything is skipped, so only the explicitly requested
# annotations (given as 'Foundry#Layer') are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Add to index file - respect skipping
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
506
Akrone1dbc382016-07-08 22:24:52 +0200507
# TODO: This should not be initialized for batch
# Cache shared via the cache file - by default
# with a size of '50m' and freshly initialized
my $cache = do {
  my %cache_param = (
    share_file => $cache_file,
    cache_size => ($cfg{cache_size} // '50m'),
    init_file  => ($cfg{cache_init} // 1)
  );
  Cache::FastMmap->new(%cache_param);
};

# Create batch object
# that is used to process all texts
my $batch_file = do {
  my @batch_param = (
    cache     => $cache,
    meta_type => $cfg{meta},
    overwrite => $cfg{overwrite},

    # Tokenization basis
    foundry   => $token_base_foundry,
    layer     => $token_base_layer,

    gzip      => $gzip,
    log       => $log,
    koral     => ($cfg{koral} // $KORAL_VERSION),

    # Annotations to respect
    anno      => \@filtered_anno,

    non_word_tokens   => ($cfg{non_word_tokens}   // 0),
    non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
  );
  KorAP::XML::Batch::File->new(@batch_param);
};
529
Akrone512b7c2020-08-07 16:16:12 +0200530
# Auto adjust jobs:
# A job count of -1 requests five jobs per available core
if ($jobs eq '-1') {
  my $cores = 1;

  # Sys::Info is optional and therefore loaded at runtime.
  # The block eval catches missing modules as well as
  # failures while querying the CPU.
  my $detected = eval {
    require Sys::Info;
    require Sys::Info::Constants;
    Sys::Info::Constants->import(':device_cpu');
    Sys::Info->new->device('CPU')->count;
  };

  # Only trust a positive number of cores,
  # otherwise stick with a single core
  if (defined $detected && $detected =~ /^\d+$/ && $detected > 0) {
    $cores = $detected;
  }
  else {
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
544
545
# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    $wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($wild_card));
  };

  # Wildcards without any match expand to nothing -
  # stop here, as all commands rely on a first input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  # NOTE(review): presumably this moves the primary archive
  # in front of its annotation archives - confirm
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
565
566
# Process a single file
# (this is the default, if no command is given)
unless ($cmd) {
  my $input = $input[0];

  # Start the overall timer and the lap timer at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # as well as the overall time and start a new lap
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # the input path requires a trailing slash
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
598
Nils Diewald59094f22014-11-05 18:20:50 +0000599
Akrone10ad322016-02-27 10:54:26 +0100600# Extract XML files
Akron81500102017-04-07 20:45:44 +0200601if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100602
Akrond5643ad2017-07-04 20:27:13 +0200603 # Output is required
604 pod2usage(%ERROR_HASH) unless $output;
605
Akron7d4cdd82016-08-17 21:39:45 +0200606 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200607 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100608
Akron7d4cdd82016-08-17 21:39:45 +0200609 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100610 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200611 $log->error("Unzip is not installed or incompatible.");
612 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100613 };
614
Akronb0c88db2016-06-29 16:33:18 +0200615 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200616 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200617
Akron31a08cb2019-02-20 20:43:26 +0100618 # Will set @sigle
619 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200620
Akrone10ad322016-02-27 10:54:26 +0100621 # Iterate over all given sigles and extract
622 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100623
Akron2812ba22016-10-28 21:55:59 +0200624 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200625
Akron03b24db2016-08-16 20:54:32 +0200626 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200627 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100628
Akron955b75b2019-02-21 14:28:41 +0100629 # TODO:
630 # - prefix???
631 $archive->extract_sigle([$_], $output, $jobs)
632 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200633 );
Akrone10ad322016-02-27 10:54:26 +0100634 print "extracted.\n";
635 };
Akronb0c88db2016-06-29 16:33:18 +0200636 }
Akron7d4cdd82016-08-17 21:39:45 +0200637
638 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200639 else {
640 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200641 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100642 };
643}
644
Akron81500102017-04-07 20:45:44 +0200645
Akron941c1a62016-02-23 17:41:41 +0100646# Process an archive
647elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000648
Akron81500102017-04-07 20:45:44 +0200649 my $archive_output;
650
651 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100652 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200653
654 # Create new archive object
655 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
656
657 # Check zip capabilities
658 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200659 $log->error("Unzip is not installed or incompatible.");
660 exit 1;
Akron81500102017-04-07 20:45:44 +0200661 };
662
663 # Add further annotation archived
664 $archive->attach($_) foreach @input[1..$#input];
665
666 # Create a temporary directory
667 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200668 $extract_dir = tempdir(CLEANUP => 0);
669 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200670 };
671
Akron63f20d42017-04-10 23:40:29 +0200672 # Add some random extra to avoid clashes with multiple archives
673 $extract_dir = catdir($extract_dir, random_string('cccccc'));
674
Akron31a08cb2019-02-20 20:43:26 +0100675 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200676 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200677 @input = ($extract_dir);
678 }
679 else {
680 $log->error('Unable to extract from primary archive ' . $input[0] .
681 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200682 exit 1;
Akron81500102017-04-07 20:45:44 +0200683 };
684 }
685
686 # Can't create archive object
687 else {
688 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200689 exit 1;
Akron81500102017-04-07 20:45:44 +0200690 };
691 };
692
Akron7d4cdd82016-08-17 21:39:45 +0200693 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100694 my $pool = Parallel::ForkManager->new($jobs);
695
Akron7d4cdd82016-08-17 21:39:45 +0200696 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100697 my $iter = 1; # Current text in process
698
Akronda3097e2017-04-23 19:53:57 +0200699 my $tar_archive;
700 my $output_dir = $output;
701 my $tar_fh;
702
703 # Initialize tar archive
704 if ($to_tar) {
705 $tar_archive = Archive::Tar::Builder->new(
706 ignore_errors => 1
707 );
708
709 # Set output name
710 my $tar_file = $output;
711 unless ($tar_file =~ /\.tar$/) {
712 $tar_file .= '.tar';
713 };
714
715 # Initiate the tar file
716 print "Writing to file $tar_file\n";
717 $tar_fh = IO::File->new($tar_file, 'w');
718 $tar_fh->binmode(1);
719
720 # Set handle
721 $tar_archive->set_handle($tar_fh);
722
723 # Output to temporary directory
724 $output_dir = File::Temp->newdir;
725 };
726
Akron941c1a62016-02-23 17:41:41 +0100727 # Report on fork message
728 $pool->run_on_finish (
729 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200730 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100731 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200732
Akron08385f62016-03-22 20:37:04 +0100733 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200734 ($iter++) . "/$count]" .
735 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200736 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200737
738 if (!$code && $to_tar && $data->[2]) {
739 my $filename = $data->[2];
740
741 # Lock filehandle
742 if (flock($tar_fh, LOCK_EX)) {
743
Akron9a062ce2017-07-04 19:12:05 +0200744 my $clean_file = fileparse($filename);
745
Akronda3097e2017-04-23 19:53:57 +0200746 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200747 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200748 unlink $filename;
749
750 # Unlock filehandle
751 flock($tar_fh, LOCK_UN);
752 }
753 else {
754 $log->warn("Unable to add $filename to archive");
755 };
756 };
757
Akron4c0cf312016-10-15 16:42:09 +0200758 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100759 }
760 );
761
762 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200763 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100764 print "Reading data ...\n";
765
Akron7d4cdd82016-08-17 21:39:45 +0200766 # unless (Cache::FastMmap->new(
767 # share_file => $cache_file,
768 # cache_size => $cache_size,
769 # init_file => $cache_init
770 # )) {
771 # print "Unable to intialize cache '$cache_file'\n\n";
772 # exit(1);
773 # };
Akron11c80302016-03-18 19:44:43 +0100774
Akron486f9ab2017-04-22 23:25:19 +0200775
Akron941c1a62016-02-23 17:41:41 +0100776 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100777 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200778 # TODO:
779 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100780 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100781 my @dirs;
782 my $dir;
783
Akron7d4cdd82016-08-17 21:39:45 +0200784 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100785 while (1) {
786 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200787 push @dirs, $dir;
788 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100789 };
790 last unless $it->next;
791 };
792
793 print "Start processing ...\n";
794 $t = Benchmark->new;
795 $count = scalar @dirs;
796
797 DIRECTORY_LOOP:
798 for (my $i = 0; $i < $count; $i++) {
799
Akrone1dbc382016-07-08 22:24:52 +0200800 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200801 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200802 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200803 );
Akron941c1a62016-02-23 17:41:41 +0100804
805 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200806 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200807
Akron13d56622016-10-31 14:54:49 +0100808 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200809 $pool->finish(
810 0,
Akronda3097e2017-04-23 19:53:57 +0200811 [
812 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
813 undef,
814 $filename
815 ]
Akron486f9ab2017-04-22 23:25:19 +0200816 );
Akron3ec48972016-08-17 23:24:52 +0200817 }
818 else {
Akron4c0cf312016-10-15 16:42:09 +0200819 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200820 };
Akron941c1a62016-02-23 17:41:41 +0100821 };
822 }
823
824 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200825 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200826
Akron941c1a62016-02-23 17:41:41 +0100827 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200828 $log->error("Unzip is not installed or incompatible.");
829 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100830 };
831
Akron08385f62016-03-22 20:37:04 +0100832 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200833 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100834
Akron31a08cb2019-02-20 20:43:26 +0100835 # Get sigles to extract
836 my $prefix = set_sigle($archive);
837
Akron941c1a62016-02-23 17:41:41 +0100838 print "Start processing ...\n";
839 $t = Benchmark->new;
840 my @dirs = $archive->list_texts;
841 $count = scalar @dirs;
842
843 ARCHIVE_LOOP:
844 for (my $i = 0; $i < $count; $i++) {
845
846 # Split path information
847 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
848
Akrone1dbc382016-07-08 22:24:52 +0200849 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200850 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200851 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200852 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200853 catfile($corpus, $doc, $text)
854 . '.json' . ($gzip ? '.gz' : '')
855 )
Akrone1dbc382016-07-08 22:24:52 +0200856 );
Akron941c1a62016-02-23 17:41:41 +0100857
858 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200859 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100860
Akron4c0cf312016-10-15 16:42:09 +0200861 # Create temporary file
862 $temp = File::Temp->newdir;
863
Akronbdf434a2016-10-24 17:42:07 +0200864 # TODO: Check if $filename exist at the beginning,
865 # because extraction can be horrible slow!
866
Akron941c1a62016-02-23 17:41:41 +0100867 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100868 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100869
Akron7d4cdd82016-08-17 21:39:45 +0200870 # Create corpus directory
871 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100872
Akron7d4cdd82016-08-17 21:39:45 +0200873 # Temporary directory
874 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100875
Akron7d4cdd82016-08-17 21:39:45 +0200876 # Write file
Akron13d56622016-10-31 14:54:49 +0100877 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200878
Akron4c0cf312016-10-15 16:42:09 +0200879 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100880 $pool->finish(
881 0,
Akronda3097e2017-04-23 19:53:57 +0200882 [
883 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
884 $temp,
885 $filename
886 ]
Akron13d56622016-10-31 14:54:49 +0100887 );
Akron7d4cdd82016-08-17 21:39:45 +0200888 }
889 else {
Akron4c0cf312016-10-15 16:42:09 +0200890 # Delete temporary file
891 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200892 };
Akron941c1a62016-02-23 17:41:41 +0100893 }
Akron7d4cdd82016-08-17 21:39:45 +0200894
895 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100896 else {
Akron4c0cf312016-10-15 16:42:09 +0200897 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100898 };
899 };
900 }
901
902 else {
903 print "Input is neither a directory nor an archive.\n\n";
904 };
905
906 $pool->wait_all_children;
907
Akron11c80302016-03-18 19:44:43 +0100908 # Delete cache file
909 unlink($cache_file) if $cache_delete;
910
Akronda3097e2017-04-23 19:53:57 +0200911 # Close tar filehandle
912 if ($to_tar && $tar_fh) {
913 $tar_archive->finish;
914 $tar_fh->close;
915 print "Wrote to tar archive.\n";
916 };
917
Akron63f20d42017-04-10 23:40:29 +0200918 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100919 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200920};
Akron941c1a62016-02-23 17:41:41 +0100921
Nils Diewald2db9ad02013-10-29 19:26:43 +0000922
# Prepare the list of sigles to process for an archive and
# determine the path prefix.
#
# Without any sigles given, the global @sigle list is filled with
# the sigles of all texts listed in the archive.
# With sigles given, all doc sigles (two path segments, optionally
# preceded by a "." root folder) are passed to extract_sigle() right
# away, while all other sigles are kept in @sigle as text sigles.
#
# Expects the archive object as the only parameter.
# Returns the prefix - either as returned by split_path() for the
# last listed text, as returned by check_prefix(), or 1 by default.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts
  unless (@sigle) {

    # A lexical loop variable (instead of $_) can't be
    # clobbered by the called archive methods
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  my @new_sigle;
  my $prefix_check = 0;

  # Check if a prefix is needed - but only once for all sigles
  my $check_prefix = sub {
    return if $prefix_check;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_check = 1;
    return;
  };

  # Iterate over all sigles
  foreach my $entry (@sigle) {

    # Sigle is a text sigle - keep it for later processing
    unless ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @new_sigle, $entry;
      $check_prefix->();
      next;
    };

    # Sigle is a doc sigle - extract it immediately
    print "$entry ...";
    $check_prefix->();
    print "\n";

    # Extraction uses a single job if sequential extraction is
    # requested, otherwise the configured number of jobs
    print '... ' . (
      $archive->extract_sigle([$entry], $output, $sequential_extraction ? 1 : $jobs)
        ? '' : 'not '
      );
    print "extracted.\n";
  };

  # Only text sigles remain to be processed
  @sigle = @new_sigle;

  return $prefix;
};
996
997
# Remove the temporary extraction directory, if one was used,
# and log the number of objects reported by remove_tree()
if ($extract_dir) {
  my %remove_opts = (safe => 1);
  my $objects = remove_tree($extract_dir, \%remove_opts);
  $log->info("Removed directory $extract_dir with $objects objects");
};


# Finish the output with an empty line
print "\n";
1006
Nils Diewald2db9ad02013-10-29 19:26:43 +00001007__END__
Akron941c1a62016-02-23 17:41:41 +01001008
1009=pod
1010
1011=encoding utf8
1012
1013=head1 NAME
1014
Akron42f48c12020-02-14 13:08:13 +01001015korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001016
1017
1018=head1 SYNOPSIS
1019
Akrona76d8352016-10-27 16:27:32 +02001020 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001021
Akron2fd402b2016-10-27 21:26:48 +02001022
Akron941c1a62016-02-23 17:41:41 +01001023=head1 DESCRIPTION
1024
1025L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1026compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001027The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001028
1029
1030=head1 INSTALLATION
1031
1032The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1033
Akronaf386982016-10-12 00:33:25 +02001034 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001035
Akronc13a1702016-03-15 19:33:14 +01001036In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001037be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001038Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron0b04b312020-10-30 17:39:18 +01001039Optional support for L<Sys::Info> to calculate available cores.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001041
1042=head1 ARGUMENTS
1043
Akrona76d8352016-10-27 16:27:32 +02001044 $ korapxml2krill -z --input <directory> --output <filename>
1045
1046Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001047It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001048
Akron941c1a62016-02-23 17:41:41 +01001049=over 2
1050
1051=item B<archive>
1052
Akron081639e2017-04-21 19:01:39 +02001053 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001054
Akron2fd402b2016-10-27 21:26:48 +02001055Converts an archive of KorAP-XML documents. It expects a directory
1056(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001057
1058=item B<extract>
1059
Akrona76d8352016-10-27 16:27:32 +02001060 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1061
1062Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001063
Akron63f20d42017-04-10 23:40:29 +02001064=item B<serial>
1065
1066 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1067
1068Convert archives sequentially. The inputs are not merged but treated
1069as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001071are created based on the archive name. In case the C<--to-tar> flag is given,
1072the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001073
1074
Akron941c1a62016-02-23 17:41:41 +01001075=back
1076
1077
1078=head1 OPTIONS
1079
1080=over 2
1081
Akrona76d8352016-10-27 16:27:32 +02001082=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001083
Akrona76d8352016-10-27 16:27:32 +02001084Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001085
Akron7606afa2016-10-25 16:23:49 +02001086Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001087document, while C<archive> expects a KorAP-XML corpus folder or a zip
1088file to batch process multiple files.
1089C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001090
Akrona76d8352016-10-27 16:27:32 +02001091C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001092that the first archive listed contains all primary data files
1093and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001094
Akron7606afa2016-10-25 16:23:49 +02001095 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001096
Akron821db3d2017-04-06 21:19:31 +02001097Input may also be defined using BSD glob wildcards.
1098
1099 -i 'file/news*.zip'
1100
1101The extended input array will be sorted in length order, so the shortest
1102path needs to contain all primary data files and all meta data files.
1103
Akron0c3e3752016-06-28 15:55:53 +02001104(The directory structure follows the base directory format,
1105that may include a C<.> root folder.
1106In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001107need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001109
Akron7606afa2016-10-25 16:23:49 +02001110To support zip files, a version of C<unzip> needs to be installed that is
1111compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001112
Akron7606afa2016-10-25 16:23:49 +02001113B<The root folder switch using the hash sign is experimental and
1114may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001115
Akronf73ffb62018-06-27 12:13:59 +02001116
Akron63f20d42017-04-10 23:40:29 +02001117=item B<--input-base|-ib> <directory>
1118
1119The base directory for inputs.
1120
1121
Akron941c1a62016-02-23 17:41:41 +01001122=item B<--output|-o> <directory|file>
1123
1124Output folder for archive processing or
1125document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001126writes to C<STDOUT> by default
1127(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001128
1129=item B<--overwrite|-w>
1130
1131Overwrite files that already exist.
1132
Akronf73ffb62018-06-27 12:13:59 +02001133
Akron3741f8b2016-12-21 19:55:21 +01001134=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001135
1136Define the default tokenization by specifying
1137the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001138of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001139This will directly take the file instead of running
1140the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001141
Akron3741f8b2016-12-21 19:55:21 +01001142
1143=item B<--base-sentences|-bs> <foundry>#<layer>
1144
1145Define the layer for base sentences.
1146If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001147Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1148layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001149
1150 Defaults to unset.
1151
1152
1153=item B<--base-paragraphs|-bp> <foundry>#<layer>
1154
1155Define the layer for base paragraphs.
1156If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001157Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001159
1160 Defaults to unset.
1161
1162
Akron41ac10b2017-02-08 22:47:25 +01001163=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1164
1165Define the layer for base pagebreaks.
1166Currently C<DeReKo#Structure> is the only layer supported.
1167
1168 Defaults to unset.
1169
1170
Akron941c1a62016-02-23 17:41:41 +01001171=item B<--skip|-s> <foundry>[#<layer>]
1172
Akronf7ad89e2016-03-16 18:22:47 +01001173Skip specific annotations by specifying the foundry
1174(and optionally the layer with a C<#>-prefix),
1175e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001176Can be set multiple times.
1177
Akronf73ffb62018-06-27 12:13:59 +02001178
Akronc13a1702016-03-15 19:33:14 +01001179=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001180
Akronf7ad89e2016-03-16 18:22:47 +01001181Convert specific annotations by specifying the foundry
1182(and optionally the layer with a C<#>-prefix),
1183e.g. C<Mate> or C<Mate#Morpho>.
1184Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001185
Akronf73ffb62018-06-27 12:13:59 +02001186
Akroned9baf02019-01-22 17:03:25 +01001187=item B<--non-word-tokens|-nwt>
1188
1189Tokenize non-word tokens like word tokens (defined as matching
C</[\d\w]/>). Useful to treat punctuation as tokens.
1191
1192 Defaults to unset.
1193
Akronf1849aa2019-12-16 23:35:33 +01001194
1195=item B<--non-verbal-tokens|-nvt>
1196
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1199
1200 Defaults to unset.
1201
1202
Akron941c1a62016-02-23 17:41:41 +01001203=item B<--jobs|-j>
1204
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001206for archive processing.
Akron11c80302016-03-18 19:44:43 +01001207Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001208
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1211
Akronc11f7982017-02-21 21:20:14 +01001212Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001213times the number of available cores, in case L<Sys::Info>
1214is available.
Akronf7ad89e2016-03-16 18:22:47 +01001215This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001216
Akronf73ffb62018-06-27 12:13:59 +02001217
Akron263274c2019-02-07 09:48:30 +01001218=item B<--koral|-k>
1219
1220Version of the output format. Supported versions are:
1221C<0> for legacy serialization, C<0.03> for serialization
1222with metadata fields as key-values on the root object,
1223C<0.4> for serialization with metadata fields as a list
1224of C<"@type":"koral:field"> objects.
1225
1226Currently defaults to C<0.03>.
1227
1228
Akron9ec88872017-04-12 16:29:06 +02001229=item B<--sequential-extraction|-se>
1230
Flag to indicate that extraction runs sequentially instead of using the C<jobs> value.
1232Some systems may have problems with extracting multiple archives
1233to the same folder at the same time.
1234Can be flagged using C<--no-sequential-extraction> as well.
1235Defaults to C<false>.
1236
Akronf73ffb62018-06-27 12:13:59 +02001237
Akron35db6e32016-03-17 22:42:22 +01001238=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001239
Akron35db6e32016-03-17 22:42:22 +01001240Define the metadata parser to use. Defaults to C<I5>.
1241Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1242This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001243
Akronf73ffb62018-06-27 12:13:59 +02001244
Akron941c1a62016-02-23 17:41:41 +01001245=item B<--gzip|-z>
1246
Akronf7ad89e2016-03-16 18:22:47 +01001247Compress the output.
1248Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001249
Akronf73ffb62018-06-27 12:13:59 +02001250
Akron11c80302016-03-18 19:44:43 +01001251=item B<--cache|-c>
1252
1253File to mmap a cache (using L<Cache::FastMmap>).
1254Defaults to C<korapxml2krill.cache> in the calling directory.
1255
Akronf73ffb62018-06-27 12:13:59 +02001256
Akron11c80302016-03-18 19:44:43 +01001257=item B<--cache-size|-cs>
1258
1259Size of the cache. Defaults to C<50m>.
1260
Akronf73ffb62018-06-27 12:13:59 +02001261
Akron11c80302016-03-18 19:44:43 +01001262=item B<--cache-init|-ci>
1263
1264Initialize cache file.
1265Can be flagged using C<--no-cache-init> as well.
1266Defaults to C<true>.
1267
Akronf73ffb62018-06-27 12:13:59 +02001268
Akron11c80302016-03-18 19:44:43 +01001269=item B<--cache-delete|-cd>
1270
1271Delete cache file after processing.
1272Can be flagged using C<--no-cache-delete> as well.
1273Defaults to C<true>.
1274
Akronf73ffb62018-06-27 12:13:59 +02001275
Akron636aa112017-04-07 18:48:56 +02001276=item B<--config|-cfg>
1277
1278Configure the parameters of your call in a file
of key-value pairs with a whitespace separator:
1280
1281 overwrite 1
1282 token DeReKo#Structure
1283 ...
1284
1285Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001286C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001287C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001288C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001290C<base-sentences>, C<base-paragraphs>,
1291C<base-pagebreaks>,
1292C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001293(semicolon separated), C<anno> (semicolon separated).
1294
Akronf73ffb62018-06-27 12:13:59 +02001295Configuration parameters will always be overwritten by
1296passed parameters.
1297
1298
Akron81500102017-04-07 20:45:44 +02001299=item B<--temporary-extract|-te>
1300
1301Only valid for the C<archive> command.
1302
1303This will first extract all files into a
1304directory and then will archive.
1305If the directory is given as C<:temp:>,
1306a temporary directory is used.
1307This is especially useful to avoid
1308massive unzipping and potential
1309network latency.
Akron636aa112017-04-07 18:48:56 +02001310
Akronf73ffb62018-06-27 12:13:59 +02001311
Akronc93a0802019-07-11 15:48:34 +02001312=item B<--to-tar>
1313
1314Only valid for the C<archive> command.
1315
1316Writes the output into a tar archive.
1317
1318
Akrone10ad322016-02-27 10:54:26 +01001319=item B<--sigle|-sg>
1320
Akron20807582016-10-26 17:11:34 +02001321Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001322Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001323I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001324Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001325In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001326On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001327
Akronf73ffb62018-06-27 12:13:59 +02001328
Akron941c1a62016-02-23 17:41:41 +01001329=item B<--log|-l>
1330
Akronb9c33812020-10-21 16:19:35 +02001331The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001332
Akronf73ffb62018-06-27 12:13:59 +02001333
Akron941c1a62016-02-23 17:41:41 +01001334=item B<--help|-h>
1335
Akron42f48c12020-02-14 13:08:13 +01001336Print help information.
Akron941c1a62016-02-23 17:41:41 +01001337
Akronf73ffb62018-06-27 12:13:59 +02001338
Akron941c1a62016-02-23 17:41:41 +01001339=item B<--version|-v>
1340
1341Print version information.
1342
1343=back
1344
Akronf73ffb62018-06-27 12:13:59 +02001345
Akronc13a1702016-03-15 19:33:14 +01001346=head1 ANNOTATION SUPPORT
1347
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1352
Akron821db3d2017-04-06 21:19:31 +02001353 Base
1354 #Paragraphs
1355 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001356
Akron821db3d2017-04-06 21:19:31 +02001357 Connexor
1358 #Morpho
1359 #Phrase
1360 #Sentences
1361 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001362
Akron821db3d2017-04-06 21:19:31 +02001363 CoreNLP
1364 #Constituency
1365 #Morpho
1366 #NamedEntities
1367 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001368
Akronce125b62017-06-19 11:54:36 +02001369 CMC
1370 #Morpho
1371
Akron821db3d2017-04-06 21:19:31 +02001372 DeReKo
1373 #Structure
Akronc13a1702016-03-15 19:33:14 +01001374
Akron57510c12019-01-04 14:58:53 +01001375 DGD
1376 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001377 #Structure
Akron57510c12019-01-04 14:58:53 +01001378
Akron821db3d2017-04-06 21:19:31 +02001379 DRuKoLa
1380 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001381
Akron821db3d2017-04-06 21:19:31 +02001382 Glemm
1383 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001384
Akronabb36902021-10-11 15:51:06 +02001385 Gingko
1386 #Morpho
1387
Akronea1aed52018-07-19 14:43:34 +02001388 HNC
1389 #Morpho
1390
Akron4c679192018-01-16 17:41:49 +01001391 LWC
1392 #Dependency
1393
Akron821db3d2017-04-06 21:19:31 +02001394 Malt
1395 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001396
Akron821db3d2017-04-06 21:19:31 +02001397 MarMoT
1398 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001399
Akron821db3d2017-04-06 21:19:31 +02001400 Mate
1401 #Dependency
1402 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001403
Akron821db3d2017-04-06 21:19:31 +02001404 MDParser
1405 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001406
Akron821db3d2017-04-06 21:19:31 +02001407 OpenNLP
1408 #Morpho
1409 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001410
Akron07e24772020-04-23 14:00:54 +02001411 RWK
1412 #Morpho
1413 #Structure
1414
Akron821db3d2017-04-06 21:19:31 +02001415 Sgbr
1416 #Lemma
1417 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001418
Akron7d5e6382019-08-08 16:36:27 +02001419 Talismane
1420 #Dependency
1421 #Morpho
1422
Akron821db3d2017-04-06 21:19:31 +02001423 TreeTagger
1424 #Morpho
1425 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001426
Akron821db3d2017-04-06 21:19:31 +02001427 XIP
1428 #Constituency
1429 #Morpho
1430 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001431
Akronc13a1702016-03-15 19:33:14 +01001432
1433More importers are in preparation.
1434New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1435See the built-in annotation importers as examples.
1436
Akronf73ffb62018-06-27 12:13:59 +02001437
Akron41e6c8b2021-10-14 20:22:18 +02001438=head1 METADATA SUPPORT
1439
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1441developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1442
1443=over 2
1444
1445=item I5 - Meta data for all I5 files
1446
1447=item Sgbr - Meta data from the Schreibgebrauch project
1448
1449=item Gingko - Meta data from the Gingko project in addition to I5
1450
1451=back
1452
1453More importers are in preparation.
1454New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1455See the built-in meta data importers as examples.
1456
1457
Akron8f69d632020-01-15 16:58:11 +01001458=head1 About KorAP-XML
1459
1460KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1461data model (Bański et al. 2013), where text data are stored physically
1462separated from their interpretations (i.e. annotations).
1463A text document in KorAP-XML therefore consists of several files
1464containing primary data, metadata and annotations.
1465
1466The structure of a single KorAP-XML document can be as follows:
1467
1468 - data.xml
1469 - header.xml
1470 + base
1471 - tokens.xml
1472 - ...
1473 + struct
1474 - structure.xml
1475 - ...
1476 + corenlp
1477 - morpho.xml
1478 - constituency.xml
1479 - ...
1480 + tree_tagger
1481 - morpho.xml
1482 - ...
1483 - ...
1484
1485The C<data.xml> contains the primary data, the C<header.xml> contains
1486the metadata, and the annotation layers are stored in subfolders
1487like C<base>, C<struct> or C<corenlp>
1488(so-called "foundries"; Bański et al. 2013).
1489
1490Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001491(Lüngen and Sperberg-McQueen 2012). See the documentation in
1492L<KorAP::XML::Meta::I5> for translatable fields.
1493
1494Annotations correspond to a variant of the TEI-P5 feature structures
1495(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001496Annotation feature structures refer to character sequences of the primary text
1497inside the C<text> element of the C<data.xml>.
1498A single annotation containing the lemma of a token can have the following structure:
1499
1500 <span from="0" to="3">
1501 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1502 <f name="lex">
1503 <fs>
1504 <f name="lemma">zum</f>
1505 </fs>
1506 </f>
1507 </fs>
1508 </span>
1509
The C<from> and C<to> attributes refer to the character span
1511in the primary text.
1512Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1513the structure may vary. See L<KorAP::XML::Annotation::*> for various
1514annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001515
1516Multiple KorAP-XML documents are organized on three levels following
1517the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1518corpus E<gt> document E<gt> text. On each level metadata information
1519can be stored, that C<korapxml2krill> will merge to a single metadata
1520object per text. A corpus is therefore structured as follows:
1521
1522 + <corpus>
1523 - header.xml
1524 + <document>
1525 - header.xml
1526 + <text>
1527 - data.xml
1528 - header.xml
1529 - ...
1530 - ...
1531
1532A single text can be identified by the concatenation of
1533the corpus identifier, the document identifier and the text identifier.
1534This identifier is called the text sigle
1535(e.g. a text with the identifier C<18486> in the document C<060> in the
1536corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1537
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1540(e.g. one zip file per foundry), which is also supported (see C<--input>).
1541
1542Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1543in form of a test suite.
1544The resulting JSON format merges all annotation layers
1545based on a single token stream.
1546
1547=head2 References
1548
1549Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1550KorAP data model: first approximation, December.
1551
1552Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1553"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1554Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1555L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1556
1557Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1558"Robust corpus architecture: a new look at virtual collections and data access",
1559Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1560L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1561
1562Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1563Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1565Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1566pp. 373-376.
1567L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1568
1569Harald Lüngen and C. M. Sperberg-McQueen (2012):
1570"A TEI P5 Document Grammar for the IDS Text Model",
1571Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1572L<PDF|https://journals.openedition.org/jtei/pdf/508>
1573
1574TEI Consortium, eds:
1575"Feature Structures",
1576Guidelines for Electronic Text Encoding and Interchange.
1577L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1578
Akron941c1a62016-02-23 17:41:41 +01001579=head1 AVAILABILITY
1580
1581 https://github.com/KorAP/KorAP-XML-Krill
1582
1583
1584=head1 COPYRIGHT AND LICENSE
1585
Akron9a2545e2022-01-16 15:15:50 +01001586Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001587
Akron6882d7d2021-02-08 09:43:57 +01001588Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001589
Akrona76d8352016-10-27 16:27:32 +02001590Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001591
Akron6882d7d2021-02-08 09:43:57 +01001592L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001593Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001594L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001595member of the
Akronf1849aa2019-12-16 23:35:33 +01001596L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001597
1598This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001599L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001600
1601=cut