blob: cb1863c41374d89e25f40e4ea812ae0b545938d8 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010013use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010015use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010018use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020019use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020020use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010021use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010022use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
# Script metadata: date of the last change, directory of the script,
# default Koral version and the message printed by --help and --version
our $LAST_CHANGE   = '2020/08/07';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
163
# Parse command:
# A leading argument not starting with a dash is taken
# as the command and removed from the argument list
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Remember the remaining arguments before the options are parsed
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100171
# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Options stored in plain scalars
my ($cfg_file, $primary, $pretty);

# Print the usage sections together with the version message
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version message
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'gzip|z'                   => \($cfg{gzip}),
  'temporary-extract|te=s'   => \($cfg{extract_dir}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \($cfg{cache_file}),
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \($cfg{log}),
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \($cfg{jobs}),
  'koral|k=f'                => \($cfg{koral}),
  'to-tar'                   => \($cfg{to_tar}),
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),
  'help|h'                   => $print_help,
  'version|v'                => $print_version
);
223
Akron63f20d42017-04-10 23:40:29 +0200224
# Load from configuration and fill non-given data.
# A value from the configuration file is only used
# if the corresponding option was not set before.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value options:
  # The configuration key is written with dashes,
  # the key in the configuration hash with underscores.
  # 'cache-delete' is part of the list, so that the
  # configuration value is no longer silently ignored.
  foreach (qw!output cache-size input-base token overwrite
              meta base-sentences base-paragraphs base-pagebreaks
              gzip to-tar log cache non-word-tokens
              non-verbal-tokens sequential-extraction cache-init
              cache-delete koral extract-dir jobs!) {
    my $underlined = $_ =~ tr/-/_/r;
    if (!defined($cfg{$underlined}) && defined $config{$_}) {
      $cfg{$underlined} = $config{$_};
    };
  };

  # Multi-value options (skip, sigle, anno):
  # Given as a semicolon separated string in the configuration
  foreach my $multi (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $values) = @$multi;
    if (!scalar(@$values) && defined $config{$key}) {
      @$values = split /\s*;\s*/, $config{$key};
    };
  };
};
259
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option is stored as 'cache_file',
# the value from the configuration file as 'cache'
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis (foundry and layer are separated by '#').
# Declaration and conditional assignment are separated, as
# the behaviour of "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (the layer is undefined, if no '#' was given)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of everything to skip (keys are lowercased)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200286
# Initialize log4perl object:
# The log level defaults to ERROR, messages are written to STDERR
my $log = do {
  Log::Log4perl->init({
    'log4perl.rootLogger'
      => uc($cfg{log} // 'ERROR') . ', STDERR',
    'log4perl.appender.STDERR'
      => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout'
      => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern'
      => '[%r] %F %L %c - %m%n'
  });
  Log::Log4perl->get_logger('main');
};

# If a command is given and the output is not written to a tar file,
# the output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
Akron63f20d42017-04-10 23:40:29 +0200301
# Parameters for pod2usage in case of an invalid call
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Auto adjust jobs:
# A value of -1 sets the number of jobs to five times the number of cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
323
Akron821db3d2017-04-06 21:19:31 +0200324
# Start serial processing:
# Each input is passed to a separate "archive" run of this script.
# The output path of each run is built from $output and the
# name derived from the input.
if ($cmd && $cmd eq 'serial') {

  if ($output && !defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output parameters from the kept arguments,
  # as they are set separately for each archive run
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - the value is the next argument
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    }

    # Flag and value in a single argument, e.g. --input=archive.zip
    elsif (/^(?:-i|--input|-o|--output)=/) {
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Set, if at least one archive run failed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - a failed run does not stop the
    # remaining runs, but is reflected in the exit code
    system @archive_cmd;
    if ($? != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed = 1;
    };
  };

  exit($failed ? 1 : 0);
};
379
# List of all supported annotation layers as [Foundry, Layer] pairs
# (optionally followed by a further parameter).
# The base layers are only added, if no other basis was chosen.
my @layers;
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo:
# Collect all bases that are taken from the DeReKo structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# The collected bases are passed as a single 'base-...' parameter
push @layers, [
  'DeReKo',
  'Structure',
  (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())
];

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
477
Akron4fa37c32017-01-20 14:43:10 +0100478
# Check filters:
# If everything is skipped ('#all'), only the explicitly requested
# annotations are used. Otherwise all known layers are used, except
# for those where the foundry or the foundry#layer pair is skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      my $foundry = lc($_->[0]);
      my $layer   = lc($_->[1]);
      !($skip{$foundry} || $skip{$foundry . '#' . $layer});
    } @layers);
497
Akrone1dbc382016-07-08 22:24:52 +0200498
# TODO: This should not be initialized for batch
# Shared cache, by default 50m in size and initialized on creation
my $cache = do {
  my $size = $cfg{cache_size} // '50m';
  my $init = $cfg{cache_init} // 1;
  Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $size,
    init_file  => $init
  );
};

# Create batch object
my $batch_file = do {

  # Parameters in the order they are passed to the constructor
  my @batch_param = (
    cache     => $cache,
    meta_type => $cfg{meta},
    overwrite => $cfg{overwrite},
    foundry   => $token_base_foundry,
    layer     => $token_base_layer,
    gzip      => $gzip,
    log       => $log,
    koral     => ($cfg{koral} // $KORAL_VERSION),
    primary   => $primary,
    pretty    => $pretty,
    anno      => \@filtered_anno,
    non_word_tokens   => ($cfg{non_word_tokens} // 0),
    non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
  );

  KorAP::XML::Batch::File->new(@batch_param);
};
522
# Glob and prefix files
if (@input) {

  # Prefix each input with the input root (if given)
  # and expand the wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n";
};
542
543
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time elapsed since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # The input list is empty, if a wildcard pattern matched nothing
  unless (defined $input) {
    $log->error('No input found.');
    exit 1;
  };

  # Ensure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
  exit;
};
575
Nils Diewald59094f22014-11-05 18:20:50 +0000576
Akrone10ad322016-02-27 10:54:26 +0100577# Extract XML files
Akron81500102017-04-07 20:45:44 +0200578if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100579
Akrond5643ad2017-07-04 20:27:13 +0200580 # Output is required
581 pod2usage(%ERROR_HASH) unless $output;
582
Akron7d4cdd82016-08-17 21:39:45 +0200583 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200584 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100585
Akron7d4cdd82016-08-17 21:39:45 +0200586 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100587 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200588 $log->error("Unzip is not installed or incompatible.");
589 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100590 };
591
Akronb0c88db2016-06-29 16:33:18 +0200592 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200593 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200594
Akron31a08cb2019-02-20 20:43:26 +0100595 # Will set @sigle
596 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200597
Akrone10ad322016-02-27 10:54:26 +0100598 # Iterate over all given sigles and extract
599 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100600
Akron2812ba22016-10-28 21:55:59 +0200601 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200602
Akron03b24db2016-08-16 20:54:32 +0200603 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200604 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100605
Akron955b75b2019-02-21 14:28:41 +0100606 # TODO:
607 # - prefix???
608 $archive->extract_sigle([$_], $output, $jobs)
609 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200610 );
Akrone10ad322016-02-27 10:54:26 +0100611 print "extracted.\n";
612 };
Akronb0c88db2016-06-29 16:33:18 +0200613 }
Akron7d4cdd82016-08-17 21:39:45 +0200614
615 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200616 else {
617 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200618 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100619 };
620}
621
Akron81500102017-04-07 20:45:44 +0200622
Akron941c1a62016-02-23 17:41:41 +0100623# Process an archive
624elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000625
Akron81500102017-04-07 20:45:44 +0200626 my $archive_output;
627
628 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100629 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200630
631 # Create new archive object
632 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
633
634 # Check zip capabilities
635 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200636 $log->error("Unzip is not installed or incompatible.");
637 exit 1;
Akron81500102017-04-07 20:45:44 +0200638 };
639
640 # Add further annotation archived
641 $archive->attach($_) foreach @input[1..$#input];
642
643 # Create a temporary directory
644 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200645 $extract_dir = tempdir(CLEANUP => 0);
646 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200647 };
648
Akron63f20d42017-04-10 23:40:29 +0200649 # Add some random extra to avoid clashes with multiple archives
650 $extract_dir = catdir($extract_dir, random_string('cccccc'));
651
Akron31a08cb2019-02-20 20:43:26 +0100652 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200653 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200654 @input = ($extract_dir);
655 }
656 else {
657 $log->error('Unable to extract from primary archive ' . $input[0] .
658 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200659 exit 1;
Akron81500102017-04-07 20:45:44 +0200660 };
661 }
662
663 # Can't create archive object
664 else {
665 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200666 exit 1;
Akron81500102017-04-07 20:45:44 +0200667 };
668 };
669
Akron7d4cdd82016-08-17 21:39:45 +0200670 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100671 my $pool = Parallel::ForkManager->new($jobs);
672
Akron7d4cdd82016-08-17 21:39:45 +0200673 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100674 my $iter = 1; # Current text in process
675
Akronda3097e2017-04-23 19:53:57 +0200676 my $tar_archive;
677 my $output_dir = $output;
678 my $tar_fh;
679
680 # Initialize tar archive
681 if ($to_tar) {
682 $tar_archive = Archive::Tar::Builder->new(
683 ignore_errors => 1
684 );
685
686 # Set output name
687 my $tar_file = $output;
688 unless ($tar_file =~ /\.tar$/) {
689 $tar_file .= '.tar';
690 };
691
692 # Initiate the tar file
693 print "Writing to file $tar_file\n";
694 $tar_fh = IO::File->new($tar_file, 'w');
695 $tar_fh->binmode(1);
696
697 # Set handle
698 $tar_archive->set_handle($tar_fh);
699
700 # Output to temporary directory
701 $output_dir = File::Temp->newdir;
702 };
703
Akron941c1a62016-02-23 17:41:41 +0100704 # Report on fork message
705 $pool->run_on_finish (
706 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200707 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100708 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200709
Akron08385f62016-03-22 20:37:04 +0100710 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200711 ($iter++) . "/$count]" .
712 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200713 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200714
715 if (!$code && $to_tar && $data->[2]) {
716 my $filename = $data->[2];
717
718 # Lock filehandle
719 if (flock($tar_fh, LOCK_EX)) {
720
Akron9a062ce2017-07-04 19:12:05 +0200721 my $clean_file = fileparse($filename);
722
Akronda3097e2017-04-23 19:53:57 +0200723 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200724 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200725 unlink $filename;
726
727 # Unlock filehandle
728 flock($tar_fh, LOCK_UN);
729 }
730 else {
731 $log->warn("Unable to add $filename to archive");
732 };
733 };
734
Akron4c0cf312016-10-15 16:42:09 +0200735 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100736 }
737 );
738
739 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200740 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100741 print "Reading data ...\n";
742
Akron7d4cdd82016-08-17 21:39:45 +0200743 # unless (Cache::FastMmap->new(
744 # share_file => $cache_file,
745 # cache_size => $cache_size,
746 # init_file => $cache_init
747 # )) {
748 # print "Unable to intialize cache '$cache_file'\n\n";
749 # exit(1);
750 # };
Akron11c80302016-03-18 19:44:43 +0100751
Akron486f9ab2017-04-22 23:25:19 +0200752
Akron941c1a62016-02-23 17:41:41 +0100753 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100754 if (-d $input[0]) {
755 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100756 my @dirs;
757 my $dir;
758
Akron7d4cdd82016-08-17 21:39:45 +0200759 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100760 while (1) {
761 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200762 push @dirs, $dir;
763 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100764 };
765 last unless $it->next;
766 };
767
768 print "Start processing ...\n";
769 $t = Benchmark->new;
770 $count = scalar @dirs;
771
772 DIRECTORY_LOOP:
773 for (my $i = 0; $i < $count; $i++) {
774
Akrone1dbc382016-07-08 22:24:52 +0200775 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200776 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200777 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200778 );
Akron941c1a62016-02-23 17:41:41 +0100779
780 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200781 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200782
Akron13d56622016-10-31 14:54:49 +0100783 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200784 $pool->finish(
785 0,
Akronda3097e2017-04-23 19:53:57 +0200786 [
787 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
788 undef,
789 $filename
790 ]
Akron486f9ab2017-04-22 23:25:19 +0200791 );
Akron3ec48972016-08-17 23:24:52 +0200792 }
793 else {
Akron4c0cf312016-10-15 16:42:09 +0200794 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200795 };
Akron941c1a62016-02-23 17:41:41 +0100796 };
797 }
798
799 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200800 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200801
Akron941c1a62016-02-23 17:41:41 +0100802 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200803 $log->error("Unzip is not installed or incompatible.");
804 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100805 };
806
Akron08385f62016-03-22 20:37:04 +0100807 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200808 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100809
Akron31a08cb2019-02-20 20:43:26 +0100810 # Get sigles to extract
811 my $prefix = set_sigle($archive);
812
Akron941c1a62016-02-23 17:41:41 +0100813 print "Start processing ...\n";
814 $t = Benchmark->new;
815 my @dirs = $archive->list_texts;
816 $count = scalar @dirs;
817
818 ARCHIVE_LOOP:
819 for (my $i = 0; $i < $count; $i++) {
820
821 # Split path information
822 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
823
Akrone1dbc382016-07-08 22:24:52 +0200824 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200825 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200826 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200827 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200828 catfile($corpus, $doc, $text)
829 . '.json' . ($gzip ? '.gz' : '')
830 )
Akrone1dbc382016-07-08 22:24:52 +0200831 );
Akron941c1a62016-02-23 17:41:41 +0100832
833 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200834 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100835
Akron4c0cf312016-10-15 16:42:09 +0200836 # Create temporary file
837 $temp = File::Temp->newdir;
838
Akronbdf434a2016-10-24 17:42:07 +0200839 # TODO: Check if $filename exist at the beginning,
840 # because extraction can be horrible slow!
841
Akron941c1a62016-02-23 17:41:41 +0100842 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100843 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100844
Akron7d4cdd82016-08-17 21:39:45 +0200845 # Create corpus directory
846 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100847
Akron7d4cdd82016-08-17 21:39:45 +0200848 # Temporary directory
849 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100850
Akron7d4cdd82016-08-17 21:39:45 +0200851 # Write file
Akron13d56622016-10-31 14:54:49 +0100852 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200853
Akron4c0cf312016-10-15 16:42:09 +0200854 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100855 $pool->finish(
856 0,
Akronda3097e2017-04-23 19:53:57 +0200857 [
858 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
859 $temp,
860 $filename
861 ]
Akron13d56622016-10-31 14:54:49 +0100862 );
863 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200864 }
865 else {
Akron4c0cf312016-10-15 16:42:09 +0200866 # Delete temporary file
867 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200868 };
Akron941c1a62016-02-23 17:41:41 +0100869 }
Akron7d4cdd82016-08-17 21:39:45 +0200870
871 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100872 else {
Akron4c0cf312016-10-15 16:42:09 +0200873 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100874 };
875 };
876 }
877
878 else {
879 print "Input is neither a directory nor an archive.\n\n";
880 };
881
882 $pool->wait_all_children;
883
Akron11c80302016-03-18 19:44:43 +0100884 # Delete cache file
885 unlink($cache_file) if $cache_delete;
886
Akronda3097e2017-04-23 19:53:57 +0200887 # Close tar filehandle
888 if ($to_tar && $tar_fh) {
889 $tar_archive->finish;
890 $tar_fh->close;
891 print "Wrote to tar archive.\n";
892 };
893
Akron63f20d42017-04-10 23:40:29 +0200894 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100895 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200896};
Akron941c1a62016-02-23 17:41:41 +0100897
Nils Diewald2db9ad02013-10-29 19:26:43 +0000898
# For an archive, this will create the list
# of all sigles to process.
#
# Expects the archive object as its only argument and works on the
# file-level variables @sigle (read and rewritten), $output, $jobs
# and $sequential_extraction (read only).
#
# - If @sigle is empty, it is filled with one "corpus/doc/text"
#   entry per path returned by $archive->list_texts.
# - Otherwise every entry matching the doc sigle pattern
#   ("corpus/doc") is extracted to $output immediately, and only
#   the remaining (text) sigles are kept in @sigle.
#
# Returns the prefix value: the first element of the last
# split_path result in the first case, the result of
# $archive->check_prefix in the second case, or 1 if neither
# method was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  # Check if a prefix is needed - this is done at most once,
  # triggered by the first sigle that is processed
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Text sigles that remain to be processed by the caller
  my @text_sigle;

  # A named loop variable is used instead of $_, as the archive
  # methods called inside the loop may modify the global $_,
  # which would be aliased to the current element of @sigle
  foreach my $entry (@sigle) {

    # Sigle is a doc sigle
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$entry ...";
      $check_prefix_once->();
      print "\n";

      # Extract the whole document right away
      my $extracted = $archive->extract_sigle(
        [$entry],
        $output,
        $sequential_extraction ? 1 : $jobs
      );

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $entry;
      $check_prefix_once->();
    };
  };

  # Only text sigles are left to process
  @sigle = @text_sigle;

  return $prefix;
};
972
973
# Remove the temporary extraction directory, if one was set,
# and log how many objects were removed
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info(
    'Removed directory ' . $extract_dir . ' with ' . $removed . ' objects'
  );
};

# Terminate the output with a final newline
print "\n";
982
Nils Diewald2db9ad02013-10-29 19:26:43 +0000983__END__
Akron941c1a62016-02-23 17:41:41 +0100984
985=pod
986
987=encoding utf8
988
989=head1 NAME
990
Akron42f48c12020-02-14 13:08:13 +0100991korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100992
993
994=head1 SYNOPSIS
995
Akrona76d8352016-10-27 16:27:32 +0200996 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100997
Akron2fd402b2016-10-27 21:26:48 +0200998
Akron941c1a62016-02-23 17:41:41 +0100999=head1 DESCRIPTION
1000
1001L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1002compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001003The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001004
1005
1006=head1 INSTALLATION
1007
1008The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1009
Akronaf386982016-10-12 00:33:25 +02001010 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001011
Akronc13a1702016-03-15 19:33:14 +01001012In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001013be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001014Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akrona93d51b2016-10-24 20:27:48 +02001015In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001016
1017=head1 ARGUMENTS
1018
Akrona76d8352016-10-27 16:27:32 +02001019 $ korapxml2krill -z --input <directory> --output <filename>
1020
1021Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001022It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001023
Akron941c1a62016-02-23 17:41:41 +01001024=over 2
1025
1026=item B<archive>
1027
Akron081639e2017-04-21 19:01:39 +02001028 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001029
Akron2fd402b2016-10-27 21:26:48 +02001030Converts an archive of KorAP-XML documents. It expects a directory
1031(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001032
1033=item B<extract>
1034
Akrona76d8352016-10-27 16:27:32 +02001035 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1036
1037Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001038
Akron63f20d42017-04-10 23:40:29 +02001039=item B<serial>
1040
1041 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1042
1043Convert archives sequentially. The inputs are not merged but treated
1044as they are (so they may be premerged or globs).
1045The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001046are created based on the archive name. In case the C<--to-tar> flag is given,
1047the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001048
1049
Akron941c1a62016-02-23 17:41:41 +01001050=back
1051
1052
1053=head1 OPTIONS
1054
1055=over 2
1056
Akrona76d8352016-10-27 16:27:32 +02001057=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001058
Akrona76d8352016-10-27 16:27:32 +02001059Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001060
Akron7606afa2016-10-25 16:23:49 +02001061Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001062document, while C<archive> expects a KorAP-XML corpus folder or a zip
1063file to batch process multiple files.
1064C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001065
Akrona76d8352016-10-27 16:27:32 +02001066C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001067that the first archive listed contains all primary data files
1068and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001069
Akron7606afa2016-10-25 16:23:49 +02001070 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001071
Akron821db3d2017-04-06 21:19:31 +02001072Input may also be defined using BSD glob wildcards.
1073
1074 -i 'file/news*.zip'
1075
1076The extended input array will be sorted in length order, so the shortest
1077path needs to contain all primary data files and all meta data files.
1078
Akron0c3e3752016-06-28 15:55:53 +02001079(The directory structure follows the base directory format,
1080that may include a C<.> root folder.
1081In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001082need to be passed with a hash sign in front of the archive's name.
1083This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001084
Akron7606afa2016-10-25 16:23:49 +02001085To support zip files, a version of C<unzip> needs to be installed that is
1086compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001087
Akron7606afa2016-10-25 16:23:49 +02001088B<The root folder switch using the hash sign is experimental and
1089may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001090
Akronf73ffb62018-06-27 12:13:59 +02001091
Akron63f20d42017-04-10 23:40:29 +02001092=item B<--input-base|-ib> <directory>
1093
1094The base directory for inputs.
1095
1096
Akron941c1a62016-02-23 17:41:41 +01001097=item B<--output|-o> <directory|file>
1098
1099Output folder for archive processing or
1100document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001101writes to C<STDOUT> by default
1102(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001103
1104=item B<--overwrite|-w>
1105
1106Overwrite files that already exist.
1107
Akronf73ffb62018-06-27 12:13:59 +02001108
Akron3741f8b2016-12-21 19:55:21 +01001109=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001110
1111Define the default tokenization by specifying
1112the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001113of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001114This will directly take the file instead of running
1115the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001116
Akron3741f8b2016-12-21 19:55:21 +01001117
1118=item B<--base-sentences|-bs> <foundry>#<layer>
1119
1120Define the layer for base sentences.
1121If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001122Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1123layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001124
1125 Defaults to unset.
1126
1127
1128=item B<--base-paragraphs|-bp> <foundry>#<layer>
1129
1130Define the layer for base paragraphs.
1131If given, this will be used instead of using C<Base#Paragraphs>.
1132Currently C<DeReKo#Structure> is the only additional layer supported.
1133
1134 Defaults to unset.
1135
1136
Akron41ac10b2017-02-08 22:47:25 +01001137=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1138
1139Define the layer for base pagebreaks.
1140Currently C<DeReKo#Structure> is the only layer supported.
1141
1142 Defaults to unset.
1143
1144
Akron941c1a62016-02-23 17:41:41 +01001145=item B<--skip|-s> <foundry>[#<layer>]
1146
Akronf7ad89e2016-03-16 18:22:47 +01001147Skip specific annotations by specifying the foundry
1148(and optionally the layer with a C<#>-prefix),
1149e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001150Can be set multiple times.
1151
Akronf73ffb62018-06-27 12:13:59 +02001152
Akronc13a1702016-03-15 19:33:14 +01001153=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001154
Akronf7ad89e2016-03-16 18:22:47 +01001155Convert specific annotations by specifying the foundry
1156(and optionally the layer with a C<#>-prefix),
1157e.g. C<Mate> or C<Mate#Morpho>.
1158Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001159
Akronf73ffb62018-06-27 12:13:59 +02001160
Akron941c1a62016-02-23 17:41:41 +01001161=item B<--primary|-p>
1162
Akronc13a1702016-03-15 19:33:14 +01001163Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001164Can be flagged using C<--no-primary> as well.
1165This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001166
Akronf73ffb62018-06-27 12:13:59 +02001167
Akroned9baf02019-01-22 17:03:25 +01001168=item B<--non-word-tokens|-nwt>
1169
1170Tokenize non-word tokens like word tokens (defined as matching
1171C</[\d\w]/>). Useful to treat punctuations as tokens.
1172
1173 Defaults to unset.
1174
Akronf1849aa2019-12-16 23:35:33 +01001175
1176=item B<--non-verbal-tokens|-nvt>
1177
1178Tokenize non-verbal tokens marked in the primary data as
1179the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1180
1181 Defaults to unset.
1182
1183
Akron941c1a62016-02-23 17:41:41 +01001184=item B<--jobs|-j>
1185
1186Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001187for archive processing.
Akron11c80302016-03-18 19:44:43 +01001188Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001189
1190If C<sequential-extraction> is not set to true, this will
1191also apply to extraction.
1192
Akronc11f7982017-02-21 21:20:14 +01001193Pass -1, and the value will be set automatically to 5
1194times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001195This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001196
Akronf73ffb62018-06-27 12:13:59 +02001197
Akron263274c2019-02-07 09:48:30 +01001198=item B<--koral|-k>
1199
1200Version of the output format. Supported versions are:
1201C<0> for legacy serialization, C<0.03> for serialization
1202with metadata fields as key-values on the root object,
1203C<0.4> for serialization with metadata fields as a list
1204of C<"@type":"koral:field"> objects.
1205
1206Currently defaults to C<0.03>.
1207
1208
Akron9ec88872017-04-12 16:29:06 +02001209=item B<--sequential-extraction|-se>
1210
1211Flag to indicate that extraction runs sequentially, i.e. that the C<jobs> value does not apply to extraction.
1212Some systems may have problems with extracting multiple archives
1213to the same folder at the same time.
1214Can be flagged using C<--no-sequential-extraction> as well.
1215Defaults to C<false>.
1216
Akronf73ffb62018-06-27 12:13:59 +02001217
Akron35db6e32016-03-17 22:42:22 +01001218=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001219
Akron35db6e32016-03-17 22:42:22 +01001220Define the metadata parser to use. Defaults to C<I5>.
1221Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1222This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001223
Akronf73ffb62018-06-27 12:13:59 +02001224
Akron941c1a62016-02-23 17:41:41 +01001225=item B<--pretty|-y>
1226
Akronc13a1702016-03-15 19:33:14 +01001227Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001228This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001229
Akronf73ffb62018-06-27 12:13:59 +02001230
Akron941c1a62016-02-23 17:41:41 +01001231=item B<--gzip|-z>
1232
Akronf7ad89e2016-03-16 18:22:47 +01001233Compress the output.
1234Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001235
Akronf73ffb62018-06-27 12:13:59 +02001236
Akron11c80302016-03-18 19:44:43 +01001237=item B<--cache|-c>
1238
1239File to mmap a cache (using L<Cache::FastMmap>).
1240Defaults to C<korapxml2krill.cache> in the calling directory.
1241
Akronf73ffb62018-06-27 12:13:59 +02001242
Akron11c80302016-03-18 19:44:43 +01001243=item B<--cache-size|-cs>
1244
1245Size of the cache. Defaults to C<50m>.
1246
Akronf73ffb62018-06-27 12:13:59 +02001247
Akron11c80302016-03-18 19:44:43 +01001248=item B<--cache-init|-ci>
1249
1250Initialize cache file.
1251Can be flagged using C<--no-cache-init> as well.
1252Defaults to C<true>.
1253
Akronf73ffb62018-06-27 12:13:59 +02001254
Akron11c80302016-03-18 19:44:43 +01001255=item B<--cache-delete|-cd>
1256
1257Delete cache file after processing.
1258Can be flagged using C<--no-cache-delete> as well.
1259Defaults to C<true>.
1260
Akronf73ffb62018-06-27 12:13:59 +02001261
Akron636aa112017-04-07 18:48:56 +02001262=item B<--config|-cfg>
1263
1264Configure the parameters of your call in a file
1265of key-value pairs with whitespace separator
1266
1267 overwrite 1
1268 token DeReKo#Structure
1269 ...
1270
1271Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001272C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001273C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001274C<output>, C<koral>,
1275C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001276C<base-sentences>, C<base-paragraphs>,
1277C<base-pagebreaks>,
1278C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001279(semicolon separated), C<anno> (semicolon separated).
1280
Akronf73ffb62018-06-27 12:13:59 +02001281Configuration parameters will always be overwritten by
1282passed parameters.
1283
1284
Akron81500102017-04-07 20:45:44 +02001285=item B<--temporary-extract|-te>
1286
1287Only valid for the C<archive> command.
1288
1289This will first extract all files into a
1290directory and then will archive.
1291If the directory is given as C<:temp:>,
1292a temporary directory is used.
1293This is especially useful to avoid
1294massive unzipping and potential
1295network latency.
Akron636aa112017-04-07 18:48:56 +02001296
Akronf73ffb62018-06-27 12:13:59 +02001297
Akronc93a0802019-07-11 15:48:34 +02001298=item B<--to-tar>
1299
1300Only valid for the C<archive> command.
1301
1302Writes the output into a tar archive.
1303
1304
Akrone10ad322016-02-27 10:54:26 +01001305=item B<--sigle|-sg>
1306
Akron20807582016-10-26 17:11:34 +02001307Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001308Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001309I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001310Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001311In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001312On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001313
Akronf73ffb62018-06-27 12:13:59 +02001314
Akron941c1a62016-02-23 17:41:41 +01001315=item B<--log|-l>
1316
1317The L<Log::Log4perl> log level, defaults to C<ERROR>.
1318
Akronf73ffb62018-06-27 12:13:59 +02001319
Akron941c1a62016-02-23 17:41:41 +01001320=item B<--help|-h>
1321
Akron42f48c12020-02-14 13:08:13 +01001322Print help information.
Akron941c1a62016-02-23 17:41:41 +01001323
Akronf73ffb62018-06-27 12:13:59 +02001324
Akron941c1a62016-02-23 17:41:41 +01001325=item B<--version|-v>
1326
1327Print version information.
1328
1329=back
1330
Akronf73ffb62018-06-27 12:13:59 +02001331
Akronc13a1702016-03-15 19:33:14 +01001332=head1 ANNOTATION SUPPORT
1333
1334L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1335developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1336The base foundry with paragraphs, sentences, and the text element are mandatory for
1337L<Krill|https://github.com/KorAP/Krill>.
1338
Akron821db3d2017-04-06 21:19:31 +02001339 Base
1340 #Paragraphs
1341 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001342
Akron821db3d2017-04-06 21:19:31 +02001343 Connexor
1344 #Morpho
1345 #Phrase
1346 #Sentences
1347 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001348
Akron821db3d2017-04-06 21:19:31 +02001349 CoreNLP
1350 #Constituency
1351 #Morpho
1352 #NamedEntities
1353 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001354
Akronce125b62017-06-19 11:54:36 +02001355 CMC
1356 #Morpho
1357
Akron821db3d2017-04-06 21:19:31 +02001358 DeReKo
1359 #Structure
Akronc13a1702016-03-15 19:33:14 +01001360
Akron57510c12019-01-04 14:58:53 +01001361 DGD
1362 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001363 #Structure
Akron57510c12019-01-04 14:58:53 +01001364
Akron821db3d2017-04-06 21:19:31 +02001365 DRuKoLa
1366 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001367
Akron821db3d2017-04-06 21:19:31 +02001368 Glemm
1369 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001370
Akronea1aed52018-07-19 14:43:34 +02001371 HNC
1372 #Morpho
1373
Akron4c679192018-01-16 17:41:49 +01001374 LWC
1375 #Dependency
1376
Akron821db3d2017-04-06 21:19:31 +02001377 Malt
1378 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001379
Akron821db3d2017-04-06 21:19:31 +02001380 MarMoT
1381 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001382
Akron821db3d2017-04-06 21:19:31 +02001383 Mate
1384 #Dependency
1385 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001386
Akron821db3d2017-04-06 21:19:31 +02001387 MDParser
1388 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001389
Akron821db3d2017-04-06 21:19:31 +02001390 OpenNLP
1391 #Morpho
1392 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001393
Akron07e24772020-04-23 14:00:54 +02001394 RWK
1395 #Morpho
1396 #Structure
1397
Akron821db3d2017-04-06 21:19:31 +02001398 Sgbr
1399 #Lemma
1400 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001401
Akron7d5e6382019-08-08 16:36:27 +02001402 Talismane
1403 #Dependency
1404 #Morpho
1405
Akron821db3d2017-04-06 21:19:31 +02001406 TreeTagger
1407 #Morpho
1408 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001409
Akron821db3d2017-04-06 21:19:31 +02001410 XIP
1411 #Constituency
1412 #Morpho
1413 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001414
Akronc13a1702016-03-15 19:33:14 +01001415
1416More importers are in preparation.
1417New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1418See the built-in annotation importers as examples.
1419
Akronf73ffb62018-06-27 12:13:59 +02001420
Akron8f69d632020-01-15 16:58:11 +01001421=head1 About KorAP-XML
1422
1423KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1424data model (Bański et al. 2013), where text data are stored physically
1425separated from their interpretations (i.e. annotations).
1426A text document in KorAP-XML therefore consists of several files
1427containing primary data, metadata and annotations.
1428
1429The structure of a single KorAP-XML document can be as follows:
1430
1431 - data.xml
1432 - header.xml
1433 + base
1434 - tokens.xml
1435 - ...
1436 + struct
1437 - structure.xml
1438 - ...
1439 + corenlp
1440 - morpho.xml
1441 - constituency.xml
1442 - ...
1443 + tree_tagger
1444 - morpho.xml
1445 - ...
1446 - ...
1447
1448The C<data.xml> contains the primary data, the C<header.xml> contains
1449the metadata, and the annotation layers are stored in subfolders
1450like C<base>, C<struct> or C<corenlp>
1451(so-called "foundries"; Bański et al. 2013).
1452
1453Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001454(Lüngen and Sperberg-McQueen 2012). See the documentation in
1455L<KorAP::XML::Meta::I5> for translatable fields.
1456
1457Annotations correspond to a variant of the TEI-P5 feature structures
1458(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001459Annotation feature structures refer to character sequences of the primary text
1460inside the C<text> element of the C<data.xml>.
1461A single annotation containing the lemma of a token can have the following structure:
1462
1463 <span from="0" to="3">
1464 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1465 <f name="lex">
1466 <fs>
1467 <f name="lemma">zum</f>
1468 </fs>
1469 </f>
1470 </fs>
1471 </span>
1472
1473The C<from> and C<to> attributes refer to the character span
1474in the primary text.
1475Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1476the structure may vary. See L<KorAP::XML::Annotation::*> for various
1477annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001478
1479Multiple KorAP-XML documents are organized on three levels following
1480the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1481corpus E<gt> document E<gt> text. On each level metadata information
1482can be stored, that C<korapxml2krill> will merge to a single metadata
1483object per text. A corpus is therefore structured as follows:
1484
1485 + <corpus>
1486 - header.xml
1487 + <document>
1488 - header.xml
1489 + <text>
1490 - data.xml
1491 - header.xml
1492 - ...
1493 - ...
1494
1495A single text can be identified by the concatenation of
1496the corpus identifier, the document identifier and the text identifier.
1497This identifier is called the text sigle
1498(e.g. a text with the identifier C<18486> in the document C<060> in the
1499corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1500
1501These corpora are often stored in zip files, which C<korapxml2krill>
1502can deal with. Corpora may also be split in multiple zip archives
1503(e.g. one zip file per foundry), which is also supported (see C<--input>).
1504
1505Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1506in form of a test suite.
1507The resulting JSON format merges all annotation layers
1508based on a single token stream.
1509
1510=head2 References
1511
1512Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1513KorAP data model: first approximation, December.
1514
1515Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1516"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1517Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1518L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1519
1520Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1521"Robust corpus architecture: a new look at virtual collections and data access",
1522Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1523L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1524
1525Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1526Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1527"Towards an international standard on featurestructure representation",
1528Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1529pp. 373-376.
1530L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1531
1532Harald Lüngen and C. M. Sperberg-McQueen (2012):
1533"A TEI P5 Document Grammar for the IDS Text Model",
1534Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1535L<PDF|https://journals.openedition.org/jtei/pdf/508>
1536
1537TEI Consortium, eds:
1538"Feature Structures",
1539Guidelines for Electronic Text Encoding and Interchange.
1540L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1541
Akron941c1a62016-02-23 17:41:41 +01001542=head1 AVAILABILITY
1543
1544 https://github.com/KorAP/KorAP-XML-Krill
1545
1546
1547=head1 COPYRIGHT AND LICENSE
1548
Akron8f69d632020-01-15 16:58:11 +01001549Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001550
Akron8f69d632020-01-15 16:58:11 +01001551Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001552
Akrona76d8352016-10-27 16:27:32 +02001553Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001554
1555L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1556Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001557L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001558member of the
Akronf1849aa2019-12-16 23:35:33 +01001559L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001560
1561This program is free software published under the
1562L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1563
1564=cut