blob: 791d1153e4b0301390944e9a9280352403a40548 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010013use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010015use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010018use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020019use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020020use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010021use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010022use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
# Release information of the script
our $LAST_CHANGE   = '2020/08/07';

# Directory the script is located in
our $LOCAL         = $FindBin::Bin;

# Default Koral version (can be overridden with --koral)
our $KORAL_VERSION = 0.03;

# Message shown by --help, --version and on usage errors
our $VERSION_MSG   = 'Version ' . $KorAP::XML::Krill::VERSION
  . ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";
163
# Parse command:
# A leading argument that does not start with a dash is
# taken as the command and removed from the argument list.
our @ARGV;
my $cmd = ($ARGV[0] && $ARGV[0] !~ /^-/) ? shift(@ARGV) : undef;

# Remember the remaining arguments, so they can be passed on
# to child processes
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100171
# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Options that are not stored in the configuration hash
my ($cfg_file, $pretty);

# Parse options from the command line.
# Scalar options are written directly into %cfg, so values missing
# on the command line can be filled in from other sources afterwards.
GetOptions(
  'input|i=s'                 => \@input,
  'input-base|ib=s'           => \$cfg{input_base},
  'output|o=s'                => \$cfg{output},
  'overwrite|w'               => \$cfg{overwrite},
  'meta|m=s'                  => \$cfg{meta},
  'token|t=s'                 => \$cfg{token},
  'base-sentences|bs=s'       => \$cfg{base_sentences},
  'base-paragraphs|bp=s'      => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'     => \$cfg{base_pagebreaks},
  'gzip|z'                    => \$cfg{gzip},
  'temporary-extract|te=s'    => \$cfg{extract_dir},
  'skip|s=s'                  => \@skip,
  'sigle|sg=s'                => \@sigle,
  'cache|c=s'                 => \$cfg{cache_file},
  'config|cfg=s'              => \$cfg_file,
  'log|l=s'                   => \$cfg{log},
  'anno|a=s'                  => \@anno,

  # Only kept to notify the user about the removed flag
  'primary|p!'                => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y'                  => \$pretty,
  'jobs|j=i'                  => \$cfg{jobs},
  'koral|k=f'                 => \$cfg{koral},
  'to-tar'                    => \$cfg{to_tar},
  'non-word-tokens|nwt'       => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'     => \$cfg{non_verbal_tokens},
  'sequential-extraction|se'  => \$cfg{sequential_extraction},
  'cache-size|cs=s'           => \$cfg{cache_size},
  'cache-delete|cd!'          => \$cfg{cache_delete},
  'cache-init|ci!'            => \$cfg{cache_init},

  # Print the selected POD sections
  'help|h'                    => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v'                 => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);

# Parameters passed to pod2usage() on invalid invocations
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200233
# Load from configuration and fill non-given data.
# Values given on the command line always take precedence.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Single-valued options - the configuration file uses dashes,
  # the configuration hash uses underscores
  my @scalar_options = qw!output cache-size input-base token overwrite
                          meta base-sentences base-paragraphs base-pagebreaks
                          gzip to-tar log cache non-word-tokens
                          non-verbal-tokens sequential-extraction cache-init
                          koral extract-dir jobs!;

  for my $option (@scalar_options) {
    next unless defined $config{$option};

    (my $underlined = $option) =~ tr/-/_/;
    $cfg{$underlined} //= $config{$option};
  };

  # Multi-valued options are separated by semicolons
  # in the configuration file
  my @list_options = (
    [ 'skip'  => \@skip  ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno  ]
  );

  for my $list_option (@list_options) {
    my ($option, $values) = @$list_option;

    # Already given on the command line or not configured
    next if scalar(@$values) || !defined $config{$option};

    @$values = split /\s*;\s*/, $config{$option};
  };
};
268
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored as 'cache_file', while the
# configuration file provides 'cache'. Respect both, with the command
# line taking precedence (otherwise --cache would be silently ignored).
my $cache_file  = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences}  // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is not combined with a statement modifier, as the
# behaviour of 'my ... if ...' is undefined in Perl. Splitting an
# empty string simply leaves both variables undefined.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension (the layer is missing, if no '#' was given)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct,
# e.g. 'CORPUS_DOC.TEXT' becomes 'CORPUS/DOC/TEXT'
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of foundries and layers to skip (case insensitive)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200295
# Initialize log4perl object - the log level defaults to ERROR
# and all messages are written to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($cfg{log} // 'ERROR') . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');

# Commands not writing to a tar archive require
# the output to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined and
# gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000316
Akronc11f7982017-02-21 21:20:14 +0100317
# Start serial processing:
# Every input is processed by a separate 'archive' run of this script,
# writing to an own subdirectory (or tar file) below the output.
if ($cmd && $cmd eq 'serial') {

  # Remove all input and output parameters from the arguments passed on,
  # as every child process receives its own input and output.
  # Beside '-i value' all spellings accepted by Getopt::Long are
  # recognized ('--input=value', '-input value' ...), as otherwise
  # the inputs of the parent would leak into every child process.
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag
    if ($arg =~ /^--?(?:input|i|output|o)(=.*)?\z/s) {

      # The value follows as the next argument,
      # unless it is attached to the flag
      $remove_next = defined $1 ? 0 : 1;
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;

  # Becomes true as soon as one archive process fails
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - continue with the remaining inputs on failure,
    # but remember the failure for the exit code
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $_ failed: " .
          ($? == -1 ? "unable to start process ($!)" : 'exit status ' . ($? >> 8))
        );
      $failed = 1;
    };
  };

  exit($failed);
};
367
Akrone512b7c2020-08-07 16:16:12 +0200368
# List of all supported annotation layers as [Foundry, Layer, (Option)].
# The order of the list is the order the layers are added to the batch.
my @layers;

# Base sentences and paragraphs, unless taken from another foundry
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (
  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo - collect all base structures based on the DeReKo structure
my @dereko_attr = ();
foreach my $base (
  ['sentences'  => $base_sentences],
  ['paragraphs' => $base_paragraphs],
  ['pagebreaks' => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

push @layers, (
  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences']
);

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

push @layers, (
  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
466
Akron4fa37c32017-01-20 14:43:10 +0100467
# Check filters
my @filtered_anno;

# Everything is skipped - only add the explicitly
# requested annotations given as 'Foundry#Layer'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Keep, if neither Foundry nor Foundry#Layer should be skipped
    !$skip{$foundry} && !$skip{$foundry . '#' . lc($_->[1])};
  } @layers;
};
486
Akrone1dbc382016-07-08 22:24:52 +0200487
# TODO: This should not be initialized for batch
# Cache shared between all processes
my @cache_param = (
  share_file => $cache_file,
  cache_size => ($cfg{cache_size} // '50m'),
  init_file  => ($cfg{cache_init} // 1)
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object, converting single texts
# based on the chosen tokenization and annotations
my @batch_param = (
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => ($cfg{koral} // $KORAL_VERSION),
  pretty            => $pretty,
  anno              => \@filtered_anno,
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
510
Akrone512b7c2020-08-07 16:16:12 +0200511
# Auto adjust jobs:
# A value of -1 requests five jobs per available core
if ($jobs eq '-1') {
  my $cpu   = Sys::Info->new->device('CPU');
  my $cores = $cpu->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
518
519
# Glob and prefix files
if (@input > 0) {

  # Prefix all inputs with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Wildcards without any match expand to nothing - stop here,
  # as all further processing relies on at least one input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
539
540
# Process a single file, if no command is given
if (!$cmd) {
  my $input = $input[0];

  # Start the timers at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop
  # and since the start of the script
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Ensure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
572
Nils Diewald59094f22014-11-05 18:20:50 +0000573
# Extract XML files for the given sigles from the archives
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object for the primary archive
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  for my $annotation_archive (@input[1..$#input]) {
    $archive->attach($annotation_archive);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  for my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
618
Akron81500102017-04-07 20:45:44 +0200619
Akron941c1a62016-02-23 17:41:41 +0100620# Process an archive
621elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000622
Akron81500102017-04-07 20:45:44 +0200623 my $archive_output;
624
625 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100626 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200627
628 # Create new archive object
629 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
630
631 # Check zip capabilities
632 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200633 $log->error("Unzip is not installed or incompatible.");
634 exit 1;
Akron81500102017-04-07 20:45:44 +0200635 };
636
637 # Add further annotation archived
638 $archive->attach($_) foreach @input[1..$#input];
639
640 # Create a temporary directory
641 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200642 $extract_dir = tempdir(CLEANUP => 0);
643 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200644 };
645
Akron63f20d42017-04-10 23:40:29 +0200646 # Add some random extra to avoid clashes with multiple archives
647 $extract_dir = catdir($extract_dir, random_string('cccccc'));
648
Akron31a08cb2019-02-20 20:43:26 +0100649 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200650 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200651 @input = ($extract_dir);
652 }
653 else {
654 $log->error('Unable to extract from primary archive ' . $input[0] .
655 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200656 exit 1;
Akron81500102017-04-07 20:45:44 +0200657 };
658 }
659
660 # Can't create archive object
661 else {
662 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200663 exit 1;
Akron81500102017-04-07 20:45:44 +0200664 };
665 };
666
Akron7d4cdd82016-08-17 21:39:45 +0200667 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100668 my $pool = Parallel::ForkManager->new($jobs);
669
Akron7d4cdd82016-08-17 21:39:45 +0200670 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100671 my $iter = 1; # Current text in process
672
Akronda3097e2017-04-23 19:53:57 +0200673 my $tar_archive;
674 my $output_dir = $output;
675 my $tar_fh;
676
677 # Initialize tar archive
678 if ($to_tar) {
679 $tar_archive = Archive::Tar::Builder->new(
680 ignore_errors => 1
681 );
682
683 # Set output name
684 my $tar_file = $output;
685 unless ($tar_file =~ /\.tar$/) {
686 $tar_file .= '.tar';
687 };
688
689 # Initiate the tar file
690 print "Writing to file $tar_file\n";
691 $tar_fh = IO::File->new($tar_file, 'w');
692 $tar_fh->binmode(1);
693
694 # Set handle
695 $tar_archive->set_handle($tar_fh);
696
697 # Output to temporary directory
698 $output_dir = File::Temp->newdir;
699 };
700
Akron941c1a62016-02-23 17:41:41 +0100701 # Report on fork message
702 $pool->run_on_finish (
703 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200704 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100705 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200706
Akron08385f62016-03-22 20:37:04 +0100707 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200708 ($iter++) . "/$count]" .
709 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200710 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200711
712 if (!$code && $to_tar && $data->[2]) {
713 my $filename = $data->[2];
714
715 # Lock filehandle
716 if (flock($tar_fh, LOCK_EX)) {
717
Akron9a062ce2017-07-04 19:12:05 +0200718 my $clean_file = fileparse($filename);
719
Akronda3097e2017-04-23 19:53:57 +0200720 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200721 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200722 unlink $filename;
723
724 # Unlock filehandle
725 flock($tar_fh, LOCK_UN);
726 }
727 else {
728 $log->warn("Unable to add $filename to archive");
729 };
730 };
731
Akron4c0cf312016-10-15 16:42:09 +0200732 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100733 }
734 );
735
736 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200737 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100738 print "Reading data ...\n";
739
Akron7d4cdd82016-08-17 21:39:45 +0200740 # unless (Cache::FastMmap->new(
741 # share_file => $cache_file,
742 # cache_size => $cache_size,
743 # init_file => $cache_init
744 # )) {
745 # print "Unable to intialize cache '$cache_file'\n\n";
746 # exit(1);
747 # };
Akron11c80302016-03-18 19:44:43 +0100748
Akron486f9ab2017-04-22 23:25:19 +0200749
Akron941c1a62016-02-23 17:41:41 +0100750 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100751 if (-d $input[0]) {
752 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100753 my @dirs;
754 my $dir;
755
Akron7d4cdd82016-08-17 21:39:45 +0200756 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100757 while (1) {
758 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200759 push @dirs, $dir;
760 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100761 };
762 last unless $it->next;
763 };
764
765 print "Start processing ...\n";
766 $t = Benchmark->new;
767 $count = scalar @dirs;
768
769 DIRECTORY_LOOP:
770 for (my $i = 0; $i < $count; $i++) {
771
Akrone1dbc382016-07-08 22:24:52 +0200772 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200773 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200774 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200775 );
Akron941c1a62016-02-23 17:41:41 +0100776
777 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200778 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200779
Akron13d56622016-10-31 14:54:49 +0100780 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200781 $pool->finish(
782 0,
Akronda3097e2017-04-23 19:53:57 +0200783 [
784 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
785 undef,
786 $filename
787 ]
Akron486f9ab2017-04-22 23:25:19 +0200788 );
Akron3ec48972016-08-17 23:24:52 +0200789 }
790 else {
Akron4c0cf312016-10-15 16:42:09 +0200791 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200792 };
Akron941c1a62016-02-23 17:41:41 +0100793 };
794 }
795
796 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200797 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200798
Akron941c1a62016-02-23 17:41:41 +0100799 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200800 $log->error("Unzip is not installed or incompatible.");
801 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100802 };
803
Akron08385f62016-03-22 20:37:04 +0100804 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200805 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100806
Akron31a08cb2019-02-20 20:43:26 +0100807 # Get sigles to extract
808 my $prefix = set_sigle($archive);
809
Akron941c1a62016-02-23 17:41:41 +0100810 print "Start processing ...\n";
811 $t = Benchmark->new;
812 my @dirs = $archive->list_texts;
813 $count = scalar @dirs;
814
815 ARCHIVE_LOOP:
816 for (my $i = 0; $i < $count; $i++) {
817
818 # Split path information
819 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
820
Akrone1dbc382016-07-08 22:24:52 +0200821 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200822 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200823 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200824 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200825 catfile($corpus, $doc, $text)
826 . '.json' . ($gzip ? '.gz' : '')
827 )
Akrone1dbc382016-07-08 22:24:52 +0200828 );
Akron941c1a62016-02-23 17:41:41 +0100829
830 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200831 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100832
Akron4c0cf312016-10-15 16:42:09 +0200833 # Create temporary file
834 $temp = File::Temp->newdir;
835
Akronbdf434a2016-10-24 17:42:07 +0200836 # TODO: Check if $filename exist at the beginning,
837 # because extraction can be horrible slow!
838
Akron941c1a62016-02-23 17:41:41 +0100839 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100840 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100841
Akron7d4cdd82016-08-17 21:39:45 +0200842 # Create corpus directory
843 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100844
Akron7d4cdd82016-08-17 21:39:45 +0200845 # Temporary directory
846 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100847
Akron7d4cdd82016-08-17 21:39:45 +0200848 # Write file
Akron13d56622016-10-31 14:54:49 +0100849 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200850
Akron4c0cf312016-10-15 16:42:09 +0200851 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100852 $pool->finish(
853 0,
Akronda3097e2017-04-23 19:53:57 +0200854 [
855 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
856 $temp,
857 $filename
858 ]
Akron13d56622016-10-31 14:54:49 +0100859 );
Akron7d4cdd82016-08-17 21:39:45 +0200860 }
861 else {
Akron4c0cf312016-10-15 16:42:09 +0200862 # Delete temporary file
863 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200864 };
Akron941c1a62016-02-23 17:41:41 +0100865 }
Akron7d4cdd82016-08-17 21:39:45 +0200866
867 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100868 else {
Akron4c0cf312016-10-15 16:42:09 +0200869 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100870 };
871 };
872 }
873
874 else {
875 print "Input is neither a directory nor an archive.\n\n";
876 };
877
878 $pool->wait_all_children;
879
Akron11c80302016-03-18 19:44:43 +0100880 # Delete cache file
881 unlink($cache_file) if $cache_delete;
882
Akronda3097e2017-04-23 19:53:57 +0200883 # Close tar filehandle
884 if ($to_tar && $tar_fh) {
885 $tar_archive->finish;
886 $tar_fh->close;
887 print "Wrote to tar archive.\n";
888 };
889
Akron63f20d42017-04-10 23:40:29 +0200890 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100891 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200892};
Akron941c1a62016-02-23 17:41:41 +0100893
Nils Diewald2db9ad02013-10-29 19:26:43 +0000894
# Prepare the list of sigles to process for an archive.
#
# Parameters:
#   $archive - archive object; list_texts, split_path, check_prefix and
#              extract_sigle are called on it.
#
# Side effects on the file-level @sigle array:
#   - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#     per text listed by the archive.
#   - Otherwise every entry that looks like a document sigle
#     ("corpus/doc", optionally preceded by "./") is passed to
#     extract_sigle right away (with $output as second argument) and
#     dropped from @sigle; all other entries are kept as text sigles.
#   Progress messages are printed to the selected output handle.
#
# Returns the prefix indicator: 1 by default, otherwise the first value
# returned by split_path for the last listed text, or the (possibly false)
# value returned by check_prefix.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles were given - separate doc sigles from text sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Ask the archive for a prefix only once, on the first sigle seen.
  # The result is stored even if it is false.
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  foreach my $entry (@sigle) {

    # Sigle is a doc sigle (exactly two path segments)
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$entry ...";
      $check_prefix_once->();
      print "\n";

      # Honour sequential extraction by limiting extraction to one job
      print '... ' . (
        $archive->extract_sigle([$entry], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle - keep it for later processing
    else {
      push @text_sigle, $entry;
      $check_prefix_once->();
    };
  };

  # Only text sigles remain to be processed by the caller
  @sigle = @text_sigle;

  return $prefix;
};
968
969
# Remove the temporary extraction directory, if one was set.
# With "safe" enabled, remove_tree makes no attempt to alter file
# permissions in order to delete entries.
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info("Removed directory $extract_dir with $removed objects");
}

# Finish the output with a newline
print "\n";
978
Nils Diewald2db9ad02013-10-29 19:26:43 +0000979__END__
Akron941c1a62016-02-23 17:41:41 +0100980
981=pod
982
983=encoding utf8
984
985=head1 NAME
986
Akron42f48c12020-02-14 13:08:13 +0100987korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100988
989
990=head1 SYNOPSIS
991
Akrona76d8352016-10-27 16:27:32 +0200992 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100993
Akron2fd402b2016-10-27 21:26:48 +0200994
Akron941c1a62016-02-23 17:41:41 +0100995=head1 DESCRIPTION
996
997L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
998compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +0100999The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001000
1001
1002=head1 INSTALLATION
1003
1004The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1005
Akronaf386982016-10-12 00:33:25 +02001006 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001007
Akronc13a1702016-03-15 19:33:14 +01001008In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001009be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001010Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001012
1013=head1 ARGUMENTS
1014
Akrona76d8352016-10-27 16:27:32 +02001015 $ korapxml2krill -z --input <directory> --output <filename>
1016
1017Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001018It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001019
Akron941c1a62016-02-23 17:41:41 +01001020=over 2
1021
1022=item B<archive>
1023
Akron081639e2017-04-21 19:01:39 +02001024 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001025
Akron2fd402b2016-10-27 21:26:48 +02001026Converts an archive of KorAP-XML documents. It expects a directory
1027(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001028
1029=item B<extract>
1030
Akrona76d8352016-10-27 16:27:32 +02001031 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1032
1033Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001034
Akron63f20d42017-04-10 23:40:29 +02001035=item B<serial>
1036
1037 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1038
1039Convert archives sequentially. The inputs are not merged but treated
1040as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001042are created based on the archive name. In case the C<--to-tar> flag is given,
1043the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001044
1045
Akron941c1a62016-02-23 17:41:41 +01001046=back
1047
1048
1049=head1 OPTIONS
1050
1051=over 2
1052
Akrona76d8352016-10-27 16:27:32 +02001053=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001054
Akrona76d8352016-10-27 16:27:32 +02001055Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001056
Akron7606afa2016-10-25 16:23:49 +02001057Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001058document, while C<archive> expects a KorAP-XML corpus folder or a zip
1059file to batch process multiple files.
1060C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001061
Akrona76d8352016-10-27 16:27:32 +02001062C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001063that the first archive listed contains all primary data files
1064and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001065
Akron7606afa2016-10-25 16:23:49 +02001066 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001067
Akron821db3d2017-04-06 21:19:31 +02001068Input may also be defined using BSD glob wildcards.
1069
1070 -i 'file/news*.zip'
1071
1072The extended input array will be sorted in length order, so the shortest
1073path needs to contain all primary data files and all meta data files.
1074
Akron0c3e3752016-06-28 15:55:53 +02001075(The directory structure follows the base directory format,
1076that may include a C<.> root folder.
1077In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001078need to be passed with a hash sign in front of the archive's name.
1079This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001080
Akron7606afa2016-10-25 16:23:49 +02001081To support zip files, a version of C<unzip> needs to be installed that is
1082compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001083
Akron7606afa2016-10-25 16:23:49 +02001084B<The root folder switch using the hash sign is experimental and
1085may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001086
Akronf73ffb62018-06-27 12:13:59 +02001087
Akron63f20d42017-04-10 23:40:29 +02001088=item B<--input-base|-ib> <directory>
1089
1090The base directory for inputs.
1091
1092
Akron941c1a62016-02-23 17:41:41 +01001093=item B<--output|-o> <directory|file>
1094
1095Output folder for archive processing or
1096document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001097writes to C<STDOUT> by default
1098(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001099
1100=item B<--overwrite|-w>
1101
1102Overwrite files that already exist.
1103
Akronf73ffb62018-06-27 12:13:59 +02001104
Akron3741f8b2016-12-21 19:55:21 +01001105=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001106
1107Define the default tokenization by specifying
1108the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001109of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001110This will directly take the file instead of running
1111the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001112
Akron3741f8b2016-12-21 19:55:21 +01001113
1114=item B<--base-sentences|-bs> <foundry>#<layer>
1115
1116Define the layer for base sentences.
1117If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001118Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1119layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001120
1121 Defaults to unset.
1122
1123
1124=item B<--base-paragraphs|-bp> <foundry>#<layer>
1125
1126Define the layer for base paragraphs.
1127If given, this will be used instead of using C<Base#Paragraphs>.
1128Currently C<DeReKo#Structure> is the only additional layer supported.
1129
1130 Defaults to unset.
1131
1132
Akron41ac10b2017-02-08 22:47:25 +01001133=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1134
1135Define the layer for base pagebreaks.
1136Currently C<DeReKo#Structure> is the only layer supported.
1137
1138 Defaults to unset.
1139
1140
Akron941c1a62016-02-23 17:41:41 +01001141=item B<--skip|-s> <foundry>[#<layer>]
1142
Akronf7ad89e2016-03-16 18:22:47 +01001143Skip specific annotations by specifying the foundry
1144(and optionally the layer with a C<#>-prefix),
1145e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001146Can be set multiple times.
1147
Akronf73ffb62018-06-27 12:13:59 +02001148
Akronc13a1702016-03-15 19:33:14 +01001149=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001150
Akronf7ad89e2016-03-16 18:22:47 +01001151Convert specific annotations by specifying the foundry
1152(and optionally the layer with a C<#>-prefix),
1153e.g. C<Mate> or C<Mate#Morpho>.
1154Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001155
Akronf73ffb62018-06-27 12:13:59 +02001156
Akroned9baf02019-01-22 17:03:25 +01001157=item B<--non-word-tokens|-nwt>
1158
1159Tokenize non-word tokens like word tokens (defined as matching
1160C</[\d\w]/>). Useful to treat punctuations as tokens.
1161
1162 Defaults to unset.
1163
Akronf1849aa2019-12-16 23:35:33 +01001164
1165=item B<--non-verbal-tokens|-nvt>
1166
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1169
1170 Defaults to unset.
1171
1172
Akron941c1a62016-02-23 17:41:41 +01001173=item B<--jobs|-j>
1174
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001176for archive processing.
Akron11c80302016-03-18 19:44:43 +01001177Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001178
Unless C<sequential-extraction> is set, this will
also apply to extraction.
1181
Akronc11f7982017-02-21 21:20:14 +01001182Pass -1, and the value will be set automatically to 5
1183times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001184This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001185
Akronf73ffb62018-06-27 12:13:59 +02001186
Akron263274c2019-02-07 09:48:30 +01001187=item B<--koral|-k>
1188
1189Version of the output format. Supported versions are:
1190C<0> for legacy serialization, C<0.03> for serialization
1191with metadata fields as key-values on the root object,
1192C<0.4> for serialization with metadata fields as a list
1193of C<"@type":"koral:field"> objects.
1194
1195Currently defaults to C<0.03>.
1196
1197
Akron9ec88872017-04-12 16:29:06 +02001198=item B<--sequential-extraction|-se>
1199
Flag to indicate that extraction runs sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1201Some systems may have problems with extracting multiple archives
1202to the same folder at the same time.
1203Can be flagged using C<--no-sequential-extraction> as well.
1204Defaults to C<false>.
1205
Akronf73ffb62018-06-27 12:13:59 +02001206
Akron35db6e32016-03-17 22:42:22 +01001207=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001208
Akron35db6e32016-03-17 22:42:22 +01001209Define the metadata parser to use. Defaults to C<I5>.
1210Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1211This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001212
Akronf73ffb62018-06-27 12:13:59 +02001213
Akron941c1a62016-02-23 17:41:41 +01001214=item B<--pretty|-y>
1215
Akronc13a1702016-03-15 19:33:14 +01001216Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001217This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001218
Akronf73ffb62018-06-27 12:13:59 +02001219
Akron941c1a62016-02-23 17:41:41 +01001220=item B<--gzip|-z>
1221
Akronf7ad89e2016-03-16 18:22:47 +01001222Compress the output.
1223Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001224
Akronf73ffb62018-06-27 12:13:59 +02001225
Akron11c80302016-03-18 19:44:43 +01001226=item B<--cache|-c>
1227
1228File to mmap a cache (using L<Cache::FastMmap>).
1229Defaults to C<korapxml2krill.cache> in the calling directory.
1230
Akronf73ffb62018-06-27 12:13:59 +02001231
Akron11c80302016-03-18 19:44:43 +01001232=item B<--cache-size|-cs>
1233
1234Size of the cache. Defaults to C<50m>.
1235
Akronf73ffb62018-06-27 12:13:59 +02001236
Akron11c80302016-03-18 19:44:43 +01001237=item B<--cache-init|-ci>
1238
1239Initialize cache file.
1240Can be flagged using C<--no-cache-init> as well.
1241Defaults to C<true>.
1242
Akronf73ffb62018-06-27 12:13:59 +02001243
Akron11c80302016-03-18 19:44:43 +01001244=item B<--cache-delete|-cd>
1245
1246Delete cache file after processing.
1247Can be flagged using C<--no-cache-delete> as well.
1248Defaults to C<true>.
1249
Akronf73ffb62018-06-27 12:13:59 +02001250
Akron636aa112017-04-07 18:48:56 +02001251=item B<--config|-cfg>
1252
1253Configure the parameters of your call in a file
1254of key-value pairs with whitespace separator
1255
1256 overwrite 1
1257 token DeReKo#Structure
1258 ...
1259
1260Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001261C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001262C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001263C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001265C<base-sentences>, C<base-paragraphs>,
1266C<base-pagebreaks>,
1267C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001268(semicolon separated), C<anno> (semicolon separated).
1269
Akronf73ffb62018-06-27 12:13:59 +02001270Configuration parameters will always be overwritten by
1271passed parameters.
1272
1273
Akron81500102017-04-07 20:45:44 +02001274=item B<--temporary-extract|-te>
1275
1276Only valid for the C<archive> command.
1277
1278This will first extract all files into a
1279directory and then will archive.
1280If the directory is given as C<:temp:>,
1281a temporary directory is used.
1282This is especially useful to avoid
1283massive unzipping and potential
1284network latency.
Akron636aa112017-04-07 18:48:56 +02001285
Akronf73ffb62018-06-27 12:13:59 +02001286
Akronc93a0802019-07-11 15:48:34 +02001287=item B<--to-tar>
1288
1289Only valid for the C<archive> command.
1290
1291Writes the output into a tar archive.
1292
1293
Akrone10ad322016-02-27 10:54:26 +01001294=item B<--sigle|-sg>
1295
Akron20807582016-10-26 17:11:34 +02001296Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001297Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001298I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001299Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001300In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001301On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron941c1a62016-02-23 17:41:41 +01001304=item B<--log|-l>
1305
The L<Log::Log4perl> log level. Defaults to C<ERROR>.
1307
Akronf73ffb62018-06-27 12:13:59 +02001308
Akron941c1a62016-02-23 17:41:41 +01001309=item B<--help|-h>
1310
Akron42f48c12020-02-14 13:08:13 +01001311Print help information.
Akron941c1a62016-02-23 17:41:41 +01001312
Akronf73ffb62018-06-27 12:13:59 +02001313
Akron941c1a62016-02-23 17:41:41 +01001314=item B<--version|-v>
1315
1316Print version information.
1317
1318=back
1319
Akronf73ffb62018-06-27 12:13:59 +02001320
Akronc13a1702016-03-15 19:33:14 +01001321=head1 ANNOTATION SUPPORT
1322
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1327
Akron821db3d2017-04-06 21:19:31 +02001328 Base
1329 #Paragraphs
1330 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001331
Akron821db3d2017-04-06 21:19:31 +02001332 Connexor
1333 #Morpho
1334 #Phrase
1335 #Sentences
1336 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001337
Akron821db3d2017-04-06 21:19:31 +02001338 CoreNLP
1339 #Constituency
1340 #Morpho
1341 #NamedEntities
1342 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001343
Akronce125b62017-06-19 11:54:36 +02001344 CMC
1345 #Morpho
1346
Akron821db3d2017-04-06 21:19:31 +02001347 DeReKo
1348 #Structure
Akronc13a1702016-03-15 19:33:14 +01001349
Akron57510c12019-01-04 14:58:53 +01001350 DGD
1351 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001352 #Structure
Akron57510c12019-01-04 14:58:53 +01001353
Akron821db3d2017-04-06 21:19:31 +02001354 DRuKoLa
1355 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001356
Akron821db3d2017-04-06 21:19:31 +02001357 Glemm
1358 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001359
Akronea1aed52018-07-19 14:43:34 +02001360 HNC
1361 #Morpho
1362
Akron4c679192018-01-16 17:41:49 +01001363 LWC
1364 #Dependency
1365
Akron821db3d2017-04-06 21:19:31 +02001366 Malt
1367 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001368
Akron821db3d2017-04-06 21:19:31 +02001369 MarMoT
1370 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001371
Akron821db3d2017-04-06 21:19:31 +02001372 Mate
1373 #Dependency
1374 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001375
Akron821db3d2017-04-06 21:19:31 +02001376 MDParser
1377 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001378
Akron821db3d2017-04-06 21:19:31 +02001379 OpenNLP
1380 #Morpho
1381 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001382
Akron07e24772020-04-23 14:00:54 +02001383 RWK
1384 #Morpho
1385 #Structure
1386
Akron821db3d2017-04-06 21:19:31 +02001387 Sgbr
1388 #Lemma
1389 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001390
Akron7d5e6382019-08-08 16:36:27 +02001391 Talismane
1392 #Dependency
1393 #Morpho
1394
Akron821db3d2017-04-06 21:19:31 +02001395 TreeTagger
1396 #Morpho
1397 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001398
Akron821db3d2017-04-06 21:19:31 +02001399 XIP
1400 #Constituency
1401 #Morpho
1402 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001403
Akronc13a1702016-03-15 19:33:14 +01001404
1405More importers are in preparation.
1406New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1407See the built-in annotation importers as examples.
1408
Akronf73ffb62018-06-27 12:13:59 +02001409
Akron8f69d632020-01-15 16:58:11 +01001410=head1 About KorAP-XML
1411
1412KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1413data model (Bański et al. 2013), where text data are stored physically
1414separated from their interpretations (i.e. annotations).
1415A text document in KorAP-XML therefore consists of several files
1416containing primary data, metadata and annotations.
1417
1418The structure of a single KorAP-XML document can be as follows:
1419
1420 - data.xml
1421 - header.xml
1422 + base
1423 - tokens.xml
1424 - ...
1425 + struct
1426 - structure.xml
1427 - ...
1428 + corenlp
1429 - morpho.xml
1430 - constituency.xml
1431 - ...
1432 + tree_tagger
1433 - morpho.xml
1434 - ...
1435 - ...
1436
1437The C<data.xml> contains the primary data, the C<header.xml> contains
1438the metadata, and the annotation layers are stored in subfolders
1439like C<base>, C<struct> or C<corenlp>
1440(so-called "foundries"; Bański et al. 2013).
1441
1442Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001443(Lüngen and Sperberg-McQueen 2012). See the documentation in
1444L<KorAP::XML::Meta::I5> for translatable fields.
1445
1446Annotations correspond to a variant of the TEI-P5 feature structures
1447(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001448Annotation feature structures refer to character sequences of the primary text
1449inside the C<text> element of the C<data.xml>.
1450A single annotation containing the lemma of a token can have the following structure:
1451
1452 <span from="0" to="3">
1453 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1454 <f name="lex">
1455 <fs>
1456 <f name="lemma">zum</f>
1457 </fs>
1458 </f>
1459 </fs>
1460 </span>
1461
The C<from> and C<to> attributes refer to the character span
1463in the primary text.
1464Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1465the structure may vary. See L<KorAP::XML::Annotation::*> for various
1466annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001467
1468Multiple KorAP-XML documents are organized on three levels following
1469the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1470corpus E<gt> document E<gt> text. On each level metadata information
1471can be stored, that C<korapxml2krill> will merge to a single metadata
1472object per text. A corpus is therefore structured as follows:
1473
1474 + <corpus>
1475 - header.xml
1476 + <document>
1477 - header.xml
1478 + <text>
1479 - data.xml
1480 - header.xml
1481 - ...
1482 - ...
1483
1484A single text can be identified by the concatenation of
1485the corpus identifier, the document identifier and the text identifier.
1486This identifier is called the text sigle
1487(e.g. a text with the identifier C<18486> in the document C<060> in the
1488corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1489
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1492(e.g. one zip file per foundry), which is also supported (see C<--input>).
1493
1494Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1495in form of a test suite.
1496The resulting JSON format merges all annotation layers
1497based on a single token stream.
1498
1499=head2 References
1500
1501Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1502KorAP data model: first approximation, December.
1503
1504Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1505"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1506Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1507L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1508
1509Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1510"Robust corpus architecture: a new look at virtual collections and data access",
1511Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1512L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1513
1514Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1515Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1517Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1518pp. 373-376.
1519L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1520
1521Harald Lüngen and C. M. Sperberg-McQueen (2012):
1522"A TEI P5 Document Grammar for the IDS Text Model",
1523Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1524L<PDF|https://journals.openedition.org/jtei/pdf/508>
1525
1526TEI Consortium, eds:
1527"Feature Structures",
1528Guidelines for Electronic Text Encoding and Interchange.
1529L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1530
Akron941c1a62016-02-23 17:41:41 +01001531=head1 AVAILABILITY
1532
1533 https://github.com/KorAP/KorAP-XML-Krill
1534
1535
1536=head1 COPYRIGHT AND LICENSE
1537
Akron8f69d632020-01-15 16:58:11 +01001538Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001539
Akron8f69d632020-01-15 16:58:11 +01001540Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001541
Akrona76d8352016-10-27 16:27:32 +02001542Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001543
1544L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1545Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001546L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001547member of the
Akronf1849aa2019-12-16 23:35:33 +01001548L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001549
1550This program is free software published under the
1551L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1552
1553=cut