blob: a6074f6bf6a3f0edcde2d76aacacf8567e2e71d4 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010013use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010015use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010018use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020019use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020020use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010021use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010022use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
# Release metadata used in the usage and version output
our $LAST_CHANGE   = '2020/08/07';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# One-line banner: KorAP::XML::Krill version, contact address and
# the date of the last change
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
163
# Parse command: a leading argument that does not start with a dash
# is taken as the command and removed from the argument list
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Remember the remaining arguments before option parsing consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100171
# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash for all single-valued options
my %cfg = ();

# Options kept in variables of their own
my ($cfg_file, $primary, $pretty);

# Arguments shared by all pod2usage calls
my %usage_common = (
  -msg    => $VERSION_MSG,
  -output => '-'
);

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cfg{cache_file},
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the documentation sections listed below
  'help|h' => sub {
    pod2usage(
      %usage_common,
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99
    );
  },

  # Print the short usage message only
  'version|v' => sub {
    pod2usage(
      %usage_common,
      -verbose => 0
    )
  }
);

# pod2usage arguments for invalid invocations (exit value 1)
my %ERROR_HASH = (
  %usage_common,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200231
# Load from configuration and fill non-given data -
# values already set on the command line are kept
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Single-valued settings: the dashed name in the configuration
  # corresponds to the underscored key in %cfg
  my @scalar_names = qw!output cache-size input-base token overwrite
                        meta base-sentences base-paragraphs base-pagebreaks
                        gzip to-tar log cache non-word-tokens
                        non-verbal-tokens sequential-extraction cache-init
                        koral extract-dir jobs!;

  for my $name (@scalar_names) {
    next unless defined $config{$name};
    (my $key = $name) =~ tr/-/_/;
    $cfg{$key} //= $config{$name};
  };

  # Multi-valued settings (skip, sigle, anno) are given as
  # semicolon separated lists
  for my $list (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($name, $target) = @$list;
    next if scalar(@$target) || !defined $config{$name};
    @$target = split /\s*;\s*/, $config{$name};
  };
};
266
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored in $cfg{cache_file}, while
# the configuration file key "cache" ends up in $cfg{cache}.
# Respect both, preferring the command line.
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of "foundry#layer".
# The variables are declared unconditionally, as the behaviour
# of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension (the layer is undefined, if no '#' was given)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of lowercased foundries and layers to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200293
# Initialize log4perl object: the root logger uses the level given by
# the log option (ERROR by default) and the appender named STDERR
my %log_config = (
  'log4perl.rootLogger'
    => uc($cfg{log} // 'ERROR') . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
303
# When a command is given and the result is not written to a tar file,
# the output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000314
Akronc11f7982017-02-21 21:20:14 +0100315
# Start serial processing: each input is converted by a separate
# "archive" run of this script, one run after the other
if ($cmd && $cmd eq 'serial') {

  # Remove all input and output options together with their values
  # from the original arguments, as they are set for each run below.
  # The separated ("-i file") as well as the joined ("--input=file")
  # notation is recognized.
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter, unless it is a flag with a joined value
    elsif ($arg !~ /^(?:-i|--input|-o|--output)=/) {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;

  # Number of archive runs that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving - the remaining inputs are still processed
    # in case a run fails, but the failure is remembered
    if (system(@archive_cmd) != 0) {
      $failed++;
      $log->error("Serial processing of $in failed (status $?)");
    };
  };

  # Signal failure to the caller, if any run was unsuccessful
  exit($failed ? 1 : 0);
};
365
Akrone512b7c2020-08-07 16:16:12 +0200366
# Base annotations requested from the DeReKo structure layer
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotation layers as [Foundry, Layer, (Parameter)?],
# in the order they are added
my @layers = (

  # Base - only if no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - with a parameter naming the requested base annotations
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence'] : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
464
Akron4fa37c32017-01-20 14:43:10 +0100465
# Check filters:
# If everything is skipped ('#all'), only the annotations explicitly
# given by the anno option are used. Otherwise all layers are used,
# except those whose foundry or foundry#layer is skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
       my ($foundry, $layer) = (lc($_->[0]), lc($_->[1]));
       !($skip{$foundry} || $skip{$foundry . '#' . $layer});
     } @layers);
484
Akrone1dbc382016-07-08 22:24:52 +0200485
# TODO: This should not be initialized for batch
# Cache shared via the cache file (50m and file initialization by default)
my %cache_param = (
  share_file => $cache_file,
  cache_size => ($cfg{cache_size} // '50m'),
  init_file  => ($cfg{cache_init} // 1)
);
my $cache = Cache::FastMmap->new(%cache_param);

# Parameters of the batch object that does the conversion
my %batch_param = (
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => ($cfg{koral} // $KORAL_VERSION),
  primary           => $primary,
  pretty            => $pretty,
  anno              => \@filtered_anno,
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(%batch_param);
509
Akrone512b7c2020-08-07 16:16:12 +0200510
# Auto adjust jobs: -1 means five jobs per available core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs on $cores cores");
};


# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given), expand
  # wildcards and sort the resulting files by length
  @input =
    sort { length($a) <=> length($b) }
    map  { bsd_glob($input_base ? catfile($input_base, $_) : $_) }
    @input;

  print 'Input is ' . join(', ', @input)."\n";
};
538
539
# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # The input list may be empty after globbing,
  # e.g. in case a wildcard did not match anything
  unless (defined $input) {
    $log->error('No input found');
    exit 1;
  };

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call of stop_time
  # as well as the overall time passed since compilation
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document -
  # make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
  exit;
};
571
Nils Diewald59094f22014-11-05 18:20:50 +0000572
Akrone10ad322016-02-27 10:54:26 +0100573# Extract XML files
Akron81500102017-04-07 20:45:44 +0200574if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100575
Akrond5643ad2017-07-04 20:27:13 +0200576 # Output is required
577 pod2usage(%ERROR_HASH) unless $output;
578
Akron7d4cdd82016-08-17 21:39:45 +0200579 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200580 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100581
Akron7d4cdd82016-08-17 21:39:45 +0200582 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100583 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200584 $log->error("Unzip is not installed or incompatible.");
585 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100586 };
587
Akronb0c88db2016-06-29 16:33:18 +0200588 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200589 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200590
Akron31a08cb2019-02-20 20:43:26 +0100591 # Will set @sigle
592 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200593
Akrone10ad322016-02-27 10:54:26 +0100594 # Iterate over all given sigles and extract
595 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100596
Akron2812ba22016-10-28 21:55:59 +0200597 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200598
Akron03b24db2016-08-16 20:54:32 +0200599 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200600 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100601
Akron955b75b2019-02-21 14:28:41 +0100602 # TODO:
603 # - prefix???
604 $archive->extract_sigle([$_], $output, $jobs)
605 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200606 );
Akrone10ad322016-02-27 10:54:26 +0100607 print "extracted.\n";
608 };
Akronb0c88db2016-06-29 16:33:18 +0200609 }
Akron7d4cdd82016-08-17 21:39:45 +0200610
611 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200612 else {
613 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200614 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100615 };
616}
617
Akron81500102017-04-07 20:45:44 +0200618
Akron941c1a62016-02-23 17:41:41 +0100619# Process an archive
620elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000621
Akron81500102017-04-07 20:45:44 +0200622 my $archive_output;
623
624 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100625 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200626
627 # Create new archive object
628 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
629
630 # Check zip capabilities
631 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200632 $log->error("Unzip is not installed or incompatible.");
633 exit 1;
Akron81500102017-04-07 20:45:44 +0200634 };
635
636 # Add further annotation archived
637 $archive->attach($_) foreach @input[1..$#input];
638
639 # Create a temporary directory
640 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200641 $extract_dir = tempdir(CLEANUP => 0);
642 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200643 };
644
Akron63f20d42017-04-10 23:40:29 +0200645 # Add some random extra to avoid clashes with multiple archives
646 $extract_dir = catdir($extract_dir, random_string('cccccc'));
647
Akron31a08cb2019-02-20 20:43:26 +0100648 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200649 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200650 @input = ($extract_dir);
651 }
652 else {
653 $log->error('Unable to extract from primary archive ' . $input[0] .
654 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200655 exit 1;
Akron81500102017-04-07 20:45:44 +0200656 };
657 }
658
659 # Can't create archive object
660 else {
661 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200662 exit 1;
Akron81500102017-04-07 20:45:44 +0200663 };
664 };
665
Akron7d4cdd82016-08-17 21:39:45 +0200666 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100667 my $pool = Parallel::ForkManager->new($jobs);
668
Akron7d4cdd82016-08-17 21:39:45 +0200669 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100670 my $iter = 1; # Current text in process
671
Akronda3097e2017-04-23 19:53:57 +0200672 my $tar_archive;
673 my $output_dir = $output;
674 my $tar_fh;
675
676 # Initialize tar archive
677 if ($to_tar) {
678 $tar_archive = Archive::Tar::Builder->new(
679 ignore_errors => 1
680 );
681
682 # Set output name
683 my $tar_file = $output;
684 unless ($tar_file =~ /\.tar$/) {
685 $tar_file .= '.tar';
686 };
687
688 # Initiate the tar file
689 print "Writing to file $tar_file\n";
690 $tar_fh = IO::File->new($tar_file, 'w');
691 $tar_fh->binmode(1);
692
693 # Set handle
694 $tar_archive->set_handle($tar_fh);
695
696 # Output to temporary directory
697 $output_dir = File::Temp->newdir;
698 };
699
Akron941c1a62016-02-23 17:41:41 +0100700 # Report on fork message
701 $pool->run_on_finish (
702 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200703 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100704 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200705
Akron08385f62016-03-22 20:37:04 +0100706 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200707 ($iter++) . "/$count]" .
708 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200709 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200710
711 if (!$code && $to_tar && $data->[2]) {
712 my $filename = $data->[2];
713
714 # Lock filehandle
715 if (flock($tar_fh, LOCK_EX)) {
716
Akron9a062ce2017-07-04 19:12:05 +0200717 my $clean_file = fileparse($filename);
718
Akronda3097e2017-04-23 19:53:57 +0200719 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200720 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200721 unlink $filename;
722
723 # Unlock filehandle
724 flock($tar_fh, LOCK_UN);
725 }
726 else {
727 $log->warn("Unable to add $filename to archive");
728 };
729 };
730
Akron4c0cf312016-10-15 16:42:09 +0200731 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100732 }
733 );
734
735 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200736 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100737 print "Reading data ...\n";
738
Akron7d4cdd82016-08-17 21:39:45 +0200739 # unless (Cache::FastMmap->new(
740 # share_file => $cache_file,
741 # cache_size => $cache_size,
742 # init_file => $cache_init
743 # )) {
744 # print "Unable to intialize cache '$cache_file'\n\n";
745 # exit(1);
746 # };
Akron11c80302016-03-18 19:44:43 +0100747
Akron486f9ab2017-04-22 23:25:19 +0200748
Akron941c1a62016-02-23 17:41:41 +0100749 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100750 if (-d $input[0]) {
751 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100752 my @dirs;
753 my $dir;
754
Akron7d4cdd82016-08-17 21:39:45 +0200755 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100756 while (1) {
757 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200758 push @dirs, $dir;
759 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100760 };
761 last unless $it->next;
762 };
763
764 print "Start processing ...\n";
765 $t = Benchmark->new;
766 $count = scalar @dirs;
767
768 DIRECTORY_LOOP:
769 for (my $i = 0; $i < $count; $i++) {
770
Akrone1dbc382016-07-08 22:24:52 +0200771 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200772 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200773 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200774 );
Akron941c1a62016-02-23 17:41:41 +0100775
776 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200777 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200778
Akron13d56622016-10-31 14:54:49 +0100779 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200780 $pool->finish(
781 0,
Akronda3097e2017-04-23 19:53:57 +0200782 [
783 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
784 undef,
785 $filename
786 ]
Akron486f9ab2017-04-22 23:25:19 +0200787 );
Akron3ec48972016-08-17 23:24:52 +0200788 }
789 else {
Akron4c0cf312016-10-15 16:42:09 +0200790 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200791 };
Akron941c1a62016-02-23 17:41:41 +0100792 };
793 }
794
795 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200796 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200797
Akron941c1a62016-02-23 17:41:41 +0100798 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200799 $log->error("Unzip is not installed or incompatible.");
800 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100801 };
802
Akron08385f62016-03-22 20:37:04 +0100803 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200804 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100805
Akron31a08cb2019-02-20 20:43:26 +0100806 # Get sigles to extract
807 my $prefix = set_sigle($archive);
808
Akron941c1a62016-02-23 17:41:41 +0100809 print "Start processing ...\n";
810 $t = Benchmark->new;
811 my @dirs = $archive->list_texts;
812 $count = scalar @dirs;
813
814 ARCHIVE_LOOP:
815 for (my $i = 0; $i < $count; $i++) {
816
817 # Split path information
818 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
819
Akrone1dbc382016-07-08 22:24:52 +0200820 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200821 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200822 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200823 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200824 catfile($corpus, $doc, $text)
825 . '.json' . ($gzip ? '.gz' : '')
826 )
Akrone1dbc382016-07-08 22:24:52 +0200827 );
Akron941c1a62016-02-23 17:41:41 +0100828
829 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200830 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100831
Akron4c0cf312016-10-15 16:42:09 +0200832 # Create temporary file
833 $temp = File::Temp->newdir;
834
Akronbdf434a2016-10-24 17:42:07 +0200835 # TODO: Check if $filename exist at the beginning,
836 # because extraction can be horrible slow!
837
Akron941c1a62016-02-23 17:41:41 +0100838 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100839 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100840
Akron7d4cdd82016-08-17 21:39:45 +0200841 # Create corpus directory
842 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100843
Akron7d4cdd82016-08-17 21:39:45 +0200844 # Temporary directory
845 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100846
Akron7d4cdd82016-08-17 21:39:45 +0200847 # Write file
Akron13d56622016-10-31 14:54:49 +0100848 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200849
Akron4c0cf312016-10-15 16:42:09 +0200850 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100851 $pool->finish(
852 0,
Akronda3097e2017-04-23 19:53:57 +0200853 [
854 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
855 $temp,
856 $filename
857 ]
Akron13d56622016-10-31 14:54:49 +0100858 );
Akron7d4cdd82016-08-17 21:39:45 +0200859 }
860 else {
Akron4c0cf312016-10-15 16:42:09 +0200861 # Delete temporary file
862 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200863 };
Akron941c1a62016-02-23 17:41:41 +0100864 }
Akron7d4cdd82016-08-17 21:39:45 +0200865
866 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100867 else {
Akron4c0cf312016-10-15 16:42:09 +0200868 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100869 };
870 };
871 }
872
873 else {
874 print "Input is neither a directory nor an archive.\n\n";
875 };
876
877 $pool->wait_all_children;
878
Akron11c80302016-03-18 19:44:43 +0100879 # Delete cache file
880 unlink($cache_file) if $cache_delete;
881
Akronda3097e2017-04-23 19:53:57 +0200882 # Close tar filehandle
883 if ($to_tar && $tar_fh) {
884 $tar_archive->finish;
885 $tar_fh->close;
886 print "Wrote to tar archive.\n";
887 };
888
Akron63f20d42017-04-10 23:40:29 +0200889 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100890 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200891};
Akron941c1a62016-02-23 17:41:41 +0100892
Nils Diewald2db9ad02013-10-29 19:26:43 +0000893
# Determine the sigles to process for an archive.
#
# Reads and rewrites the file-level @sigle list:
#  - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#    for every path returned by the archive's list_texts.
#  - Otherwise every entry matching the doc sigle pattern (two path
#    segments, optionally preceded by "./" or ".\") is handed to
#    extract_sigle together with $output and removed from @sigle;
#    all other entries are kept as text sigles.
#
# Parameters:
#   $archive - archive object; list_texts, split_path, check_prefix and
#              extract_sigle are called on it.
#
# Returns the prefix value: the first element returned by split_path for
# the last listed text (no sigles given), the result of check_prefix
# (sigles given), or the initial value 1 if neither was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  if (!@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  # Sigles given - handle doc sigles right away and keep the text sigles
  my @text_sigle;
  my $prefix_checked = 0;

  foreach my $entry (@sigle) {
    my $is_doc_sigle = $entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!;

    print "$entry ..." if $is_doc_sigle;

    # Check only once (on the first entry) if a prefix is needed
    unless ($prefix_checked) {
      if ($prefix = $archive->check_prefix) {
        print " with prefix ...";
      };
      $prefix_checked = 1;
    };

    # Entry is a text sigle - keep it in the list
    unless ($is_doc_sigle) {
      push @text_sigle, $entry;
      next;
    };

    # Entry is a doc sigle - extract it and report the result
    print "\n";
    print '... ' . (
      $archive->extract_sigle([$entry], $output, $sequential_extraction ? 1 : $jobs)
        ? '' : 'not '
    );
    print "extracted.\n";
  };

  @sigle = @text_sigle;

  return $prefix;
};
967
968
# Remove the extraction directory (if one is set) and log the
# number of objects reported by remove_tree
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info('Removed directory ' . $extract_dir . ' with ' . $removed . ' objects');
};


print "\n";
977
Nils Diewald2db9ad02013-10-29 19:26:43 +0000978__END__
Akron941c1a62016-02-23 17:41:41 +0100979
980=pod
981
982=encoding utf8
983
984=head1 NAME
985
Akron42f48c12020-02-14 13:08:13 +0100986korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100987
988
989=head1 SYNOPSIS
990
Akrona76d8352016-10-27 16:27:32 +0200991 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100992
Akron2fd402b2016-10-27 21:26:48 +0200993
Akron941c1a62016-02-23 17:41:41 +0100994=head1 DESCRIPTION
995
996L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
997compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +0100998The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +0100999
1000
1001=head1 INSTALLATION
1002
1003The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1004
Akronaf386982016-10-12 00:33:25 +02001005 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001006
Akronc13a1702016-03-15 19:33:14 +01001007In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001008be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001009Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001011
1012=head1 ARGUMENTS
1013
Akrona76d8352016-10-27 16:27:32 +02001014 $ korapxml2krill -z --input <directory> --output <filename>
1015
1016Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001017It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001018
Akron941c1a62016-02-23 17:41:41 +01001019=over 2
1020
1021=item B<archive>
1022
Akron081639e2017-04-21 19:01:39 +02001023 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001024
Akron2fd402b2016-10-27 21:26:48 +02001025Converts an archive of KorAP-XML documents. It expects a directory
1026(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001027
1028=item B<extract>
1029
Akrona76d8352016-10-27 16:27:32 +02001030 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1031
1032Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001033
Akron63f20d42017-04-10 23:40:29 +02001034=item B<serial>
1035
1036 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1037
1038Convert archives sequentially. The inputs are not merged but treated
1039as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001041are created based on the archive name. In case the C<--to-tar> flag is given,
1042the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001043
1044
Akron941c1a62016-02-23 17:41:41 +01001045=back
1046
1047
1048=head1 OPTIONS
1049
1050=over 2
1051
Akrona76d8352016-10-27 16:27:32 +02001052=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001053
Akrona76d8352016-10-27 16:27:32 +02001054Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001055
Akron7606afa2016-10-25 16:23:49 +02001056Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001057document, while C<archive> expects a KorAP-XML corpus folder or a zip
1058file to batch process multiple files.
1059C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001060
Akrona76d8352016-10-27 16:27:32 +02001061C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001062that the first archive listed contains all primary data files
1063and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001064
Akron7606afa2016-10-25 16:23:49 +02001065 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001066
Akron821db3d2017-04-06 21:19:31 +02001067Input may also be defined using BSD glob wildcards.
1068
1069 -i 'file/news*.zip'
1070
1071The extended input array will be sorted in length order, so the shortest
1072path needs to contain all primary data files and all meta data files.
1073
Akron0c3e3752016-06-28 15:55:53 +02001074(The directory structure follows the base directory format,
1075that may include a C<.> root folder.
1076In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001077need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001079
Akron7606afa2016-10-25 16:23:49 +02001080To support zip files, a version of C<unzip> needs to be installed that is
1081compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001082
Akron7606afa2016-10-25 16:23:49 +02001083B<The root folder switch using the hash sign is experimental and
1084may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001085
Akronf73ffb62018-06-27 12:13:59 +02001086
Akron63f20d42017-04-10 23:40:29 +02001087=item B<--input-base|-ib> <directory>
1088
1089The base directory for inputs.
1090
1091
Akron941c1a62016-02-23 17:41:41 +01001092=item B<--output|-o> <directory|file>
1093
1094Output folder for archive processing or
1095document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001096writes to C<STDOUT> by default
1097(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001098
1099=item B<--overwrite|-w>
1100
1101Overwrite files that already exist.
1102
Akronf73ffb62018-06-27 12:13:59 +02001103
Akron3741f8b2016-12-21 19:55:21 +01001104=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001105
1106Define the default tokenization by specifying
1107the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001108of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001109This will directly take the file instead of running
1110the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001111
Akron3741f8b2016-12-21 19:55:21 +01001112
1113=item B<--base-sentences|-bs> <foundry>#<layer>
1114
1115Define the layer for base sentences.
1116If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001117Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1118layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001119
1120 Defaults to unset.
1121
1122
1123=item B<--base-paragraphs|-bp> <foundry>#<layer>
1124
1125Define the layer for base paragraphs.
1126If given, this will be used instead of using C<Base#Paragraphs>.
1127Currently C<DeReKo#Structure> is the only additional layer supported.
1128
1129 Defaults to unset.
1130
1131
Akron41ac10b2017-02-08 22:47:25 +01001132=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1133
1134Define the layer for base pagebreaks.
1135Currently C<DeReKo#Structure> is the only layer supported.
1136
1137 Defaults to unset.
1138
1139
Akron941c1a62016-02-23 17:41:41 +01001140=item B<--skip|-s> <foundry>[#<layer>]
1141
Akronf7ad89e2016-03-16 18:22:47 +01001142Skip specific annotations by specifying the foundry
1143(and optionally the layer with a C<#>-prefix),
1144e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001145Can be set multiple times.
1146
Akronf73ffb62018-06-27 12:13:59 +02001147
Akronc13a1702016-03-15 19:33:14 +01001148=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001149
Akronf7ad89e2016-03-16 18:22:47 +01001150Convert specific annotations by specifying the foundry
1151(and optionally the layer with a C<#>-prefix),
1152e.g. C<Mate> or C<Mate#Morpho>.
1153Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001154
Akronf73ffb62018-06-27 12:13:59 +02001155
Akron941c1a62016-02-23 17:41:41 +01001156=item B<--primary|-p>
1157
Akronc13a1702016-03-15 19:33:14 +01001158Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001159Can be flagged using C<--no-primary> as well.
1160This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001161
Akronf73ffb62018-06-27 12:13:59 +02001162
Akroned9baf02019-01-22 17:03:25 +01001163=item B<--non-word-tokens|-nwt>
1164
1165Tokenize non-word tokens like word tokens (defined as matching
1166C</[\d\w]/>). Useful to treat punctuations as tokens.
1167
1168 Defaults to unset.
1169
Akronf1849aa2019-12-16 23:35:33 +01001170
1171=item B<--non-verbal-tokens|-nvt>
1172
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1175
1176 Defaults to unset.
1177
1178
Akron941c1a62016-02-23 17:41:41 +01001179=item B<--jobs|-j>
1180
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001182for archive processing.
Akron11c80302016-03-18 19:44:43 +01001183Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001184
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1187
Akronc11f7982017-02-21 21:20:14 +01001188Pass -1, and the value will be set automatically to 5
1189times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001190This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001191
Akronf73ffb62018-06-27 12:13:59 +02001192
Akron263274c2019-02-07 09:48:30 +01001193=item B<--koral|-k>
1194
1195Version of the output format. Supported versions are:
1196C<0> for legacy serialization, C<0.03> for serialization
1197with metadata fields as key-values on the root object,
1198C<0.4> for serialization with metadata fields as a list
1199of C<"@type":"koral:field"> objects.
1200
1201Currently defaults to C<0.03>.
1202
1203
Akron9ec88872017-04-12 16:29:06 +02001204=item B<--sequential-extraction|-se>
1205
Flag to indicate that archives are extracted sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1207Some systems may have problems with extracting multiple archives
1208to the same folder at the same time.
1209Can be flagged using C<--no-sequential-extraction> as well.
1210Defaults to C<false>.
1211
Akronf73ffb62018-06-27 12:13:59 +02001212
Akron35db6e32016-03-17 22:42:22 +01001213=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001214
Akron35db6e32016-03-17 22:42:22 +01001215Define the metadata parser to use. Defaults to C<I5>.
1216Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1217This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001218
Akronf73ffb62018-06-27 12:13:59 +02001219
Akron941c1a62016-02-23 17:41:41 +01001220=item B<--pretty|-y>
1221
Akronc13a1702016-03-15 19:33:14 +01001222Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001223This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001224
Akronf73ffb62018-06-27 12:13:59 +02001225
Akron941c1a62016-02-23 17:41:41 +01001226=item B<--gzip|-z>
1227
Akronf7ad89e2016-03-16 18:22:47 +01001228Compress the output.
1229Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001230
Akronf73ffb62018-06-27 12:13:59 +02001231
Akron11c80302016-03-18 19:44:43 +01001232=item B<--cache|-c>
1233
1234File to mmap a cache (using L<Cache::FastMmap>).
1235Defaults to C<korapxml2krill.cache> in the calling directory.
1236
Akronf73ffb62018-06-27 12:13:59 +02001237
Akron11c80302016-03-18 19:44:43 +01001238=item B<--cache-size|-cs>
1239
1240Size of the cache. Defaults to C<50m>.
1241
Akronf73ffb62018-06-27 12:13:59 +02001242
Akron11c80302016-03-18 19:44:43 +01001243=item B<--cache-init|-ci>
1244
1245Initialize cache file.
1246Can be flagged using C<--no-cache-init> as well.
1247Defaults to C<true>.
1248
Akronf73ffb62018-06-27 12:13:59 +02001249
Akron11c80302016-03-18 19:44:43 +01001250=item B<--cache-delete|-cd>
1251
1252Delete cache file after processing.
1253Can be flagged using C<--no-cache-delete> as well.
1254Defaults to C<true>.
1255
Akronf73ffb62018-06-27 12:13:59 +02001256
Akron636aa112017-04-07 18:48:56 +02001257=item B<--config|-cfg>
1258
1259Configure the parameters of your call in a file
1260of key-value pairs with whitespace separator
1261
1262 overwrite 1
1263 token DeReKo#Structure
1264 ...
1265
1266Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001267C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001268C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001269C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001271C<base-sentences>, C<base-paragraphs>,
1272C<base-pagebreaks>,
1273C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001274(semicolon separated), C<anno> (semicolon separated).
1275
Akronf73ffb62018-06-27 12:13:59 +02001276Configuration parameters will always be overwritten by
1277passed parameters.
1278
1279
Akron81500102017-04-07 20:45:44 +02001280=item B<--temporary-extract|-te>
1281
1282Only valid for the C<archive> command.
1283
1284This will first extract all files into a
1285directory and then will archive.
1286If the directory is given as C<:temp:>,
1287a temporary directory is used.
1288This is especially useful to avoid
1289massive unzipping and potential
1290network latency.
Akron636aa112017-04-07 18:48:56 +02001291
Akronf73ffb62018-06-27 12:13:59 +02001292
Akronc93a0802019-07-11 15:48:34 +02001293=item B<--to-tar>
1294
1295Only valid for the C<archive> command.
1296
1297Writes the output into a tar archive.
1298
1299
Akrone10ad322016-02-27 10:54:26 +01001300=item B<--sigle|-sg>
1301
Akron20807582016-10-26 17:11:34 +02001302Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001303Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001304I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001305Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001306In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001307On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001308
Akronf73ffb62018-06-27 12:13:59 +02001309
Akron941c1a62016-02-23 17:41:41 +01001310=item B<--log|-l>
1311
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1313
Akronf73ffb62018-06-27 12:13:59 +02001314
Akron941c1a62016-02-23 17:41:41 +01001315=item B<--help|-h>
1316
Akron42f48c12020-02-14 13:08:13 +01001317Print help information.
Akron941c1a62016-02-23 17:41:41 +01001318
Akronf73ffb62018-06-27 12:13:59 +02001319
Akron941c1a62016-02-23 17:41:41 +01001320=item B<--version|-v>
1321
1322Print version information.
1323
1324=back
1325
Akronf73ffb62018-06-27 12:13:59 +02001326
Akronc13a1702016-03-15 19:33:14 +01001327=head1 ANNOTATION SUPPORT
1328
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1330developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1331The base foundry with paragraphs, sentences, and the text element are mandatory for
1332L<Krill|https://github.com/KorAP/Krill>.
1333
Akron821db3d2017-04-06 21:19:31 +02001334 Base
1335 #Paragraphs
1336 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001337
Akron821db3d2017-04-06 21:19:31 +02001338 Connexor
1339 #Morpho
1340 #Phrase
1341 #Sentences
1342 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001343
Akron821db3d2017-04-06 21:19:31 +02001344 CoreNLP
1345 #Constituency
1346 #Morpho
1347 #NamedEntities
1348 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001349
Akronce125b62017-06-19 11:54:36 +02001350 CMC
1351 #Morpho
1352
Akron821db3d2017-04-06 21:19:31 +02001353 DeReKo
1354 #Structure
Akronc13a1702016-03-15 19:33:14 +01001355
Akron57510c12019-01-04 14:58:53 +01001356 DGD
1357 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001358 #Structure
Akron57510c12019-01-04 14:58:53 +01001359
Akron821db3d2017-04-06 21:19:31 +02001360 DRuKoLa
1361 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001362
Akron821db3d2017-04-06 21:19:31 +02001363 Glemm
1364 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001365
Akronea1aed52018-07-19 14:43:34 +02001366 HNC
1367 #Morpho
1368
Akron4c679192018-01-16 17:41:49 +01001369 LWC
1370 #Dependency
1371
Akron821db3d2017-04-06 21:19:31 +02001372 Malt
1373 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001374
Akron821db3d2017-04-06 21:19:31 +02001375 MarMoT
1376 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001377
Akron821db3d2017-04-06 21:19:31 +02001378 Mate
1379 #Dependency
1380 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001381
Akron821db3d2017-04-06 21:19:31 +02001382 MDParser
1383 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001384
Akron821db3d2017-04-06 21:19:31 +02001385 OpenNLP
1386 #Morpho
1387 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001388
Akron07e24772020-04-23 14:00:54 +02001389 RWK
1390 #Morpho
1391 #Structure
1392
Akron821db3d2017-04-06 21:19:31 +02001393 Sgbr
1394 #Lemma
1395 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001396
Akron7d5e6382019-08-08 16:36:27 +02001397 Talismane
1398 #Dependency
1399 #Morpho
1400
Akron821db3d2017-04-06 21:19:31 +02001401 TreeTagger
1402 #Morpho
1403 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001404
Akron821db3d2017-04-06 21:19:31 +02001405 XIP
1406 #Constituency
1407 #Morpho
1408 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001409
Akronc13a1702016-03-15 19:33:14 +01001410
1411More importers are in preparation.
1412New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1413See the built-in annotation importers as examples.
1414
Akronf73ffb62018-06-27 12:13:59 +02001415
Akron8f69d632020-01-15 16:58:11 +01001416=head1 About KorAP-XML
1417
1418KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1419data model (Bański et al. 2013), where text data are stored physically
1420separated from their interpretations (i.e. annotations).
1421A text document in KorAP-XML therefore consists of several files
1422containing primary data, metadata and annotations.
1423
1424The structure of a single KorAP-XML document can be as follows:
1425
1426 - data.xml
1427 - header.xml
1428 + base
1429 - tokens.xml
1430 - ...
1431 + struct
1432 - structure.xml
1433 - ...
1434 + corenlp
1435 - morpho.xml
1436 - constituency.xml
1437 - ...
1438 + tree_tagger
1439 - morpho.xml
1440 - ...
1441 - ...
1442
1443The C<data.xml> contains the primary data, the C<header.xml> contains
1444the metadata, and the annotation layers are stored in subfolders
1445like C<base>, C<struct> or C<corenlp>
1446(so-called "foundries"; Bański et al. 2013).
1447
1448Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001449(Lüngen and Sperberg-McQueen 2012). See the documentation in
1450L<KorAP::XML::Meta::I5> for translatable fields.
1451
1452Annotations correspond to a variant of the TEI-P5 feature structures
1453(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001454Annotation feature structures refer to character sequences of the primary text
1455inside the C<text> element of the C<data.xml>.
1456A single annotation containing the lemma of a token can have the following structure:
1457
1458 <span from="0" to="3">
1459 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1460 <f name="lex">
1461 <fs>
1462 <f name="lemma">zum</f>
1463 </fs>
1464 </f>
1465 </fs>
1466 </span>
1467
The C<from> and C<to> attributes refer to the character span
1469in the primary text.
1470Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1471the structure may vary. See L<KorAP::XML::Annotation::*> for various
1472annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001473
1474Multiple KorAP-XML documents are organized on three levels following
1475the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1476corpus E<gt> document E<gt> text. On each level metadata information
1477can be stored, that C<korapxml2krill> will merge to a single metadata
1478object per text. A corpus is therefore structured as follows:
1479
1480 + <corpus>
1481 - header.xml
1482 + <document>
1483 - header.xml
1484 + <text>
1485 - data.xml
1486 - header.xml
1487 - ...
1488 - ...
1489
1490A single text can be identified by the concatenation of
1491the corpus identifier, the document identifier and the text identifier.
1492This identifier is called the text sigle
1493(e.g. a text with the identifier C<18486> in the document C<060> in the
1494corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1495
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split in multiple zip archives
1498(e.g. one zip file per foundry), which is also supported (see C<--input>).
1499
1500Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1501in form of a test suite.
1502The resulting JSON format merges all annotation layers
1503based on a single token stream.
1504
1505=head2 References
1506
1507Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1508KorAP data model: first approximation, December.
1509
1510Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1511"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1512Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1513L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1514
1515Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1516"Robust corpus architecture: a new look at virtual collections and data access",
1517Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1518L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1519
1520Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1521Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1523Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1524pp. 373-376.
1525L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1526
1527Harald Lüngen and C. M. Sperberg-McQueen (2012):
1528"A TEI P5 Document Grammar for the IDS Text Model",
1529Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1530L<PDF|https://journals.openedition.org/jtei/pdf/508>
1531
1532TEI Consortium, eds:
1533"Feature Structures",
1534Guidelines for Electronic Text Encoding and Interchange.
1535L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1536
Akron941c1a62016-02-23 17:41:41 +01001537=head1 AVAILABILITY
1538
1539 https://github.com/KorAP/KorAP-XML-Krill
1540
1541
1542=head1 COPYRIGHT AND LICENSE
1543
Akron8f69d632020-01-15 16:58:11 +01001544Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001545
Akron8f69d632020-01-15 16:58:11 +01001546Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001547
Akrona76d8352016-10-27 16:27:32 +02001548Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001549
1550L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1551Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001552L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001553member of the
Akronf1849aa2019-12-16 23:35:33 +01001554L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001555
1556This program is free software published under the
1557L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1558
1559=cut