blob: b2533bce5e4a137299244cfccfc187d1b18812b4 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010022use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020023use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020024use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020025use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020026use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020027use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron941c1a62016-02-23 17:41:41 +0100157# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100158
# Date of the most recent change to this script
our $LAST_CHANGE = '2021/10/11';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Koral version used, if none is requested explicitly
our $KORAL_VERSION = 0.03;

# Banner printed along with the usage and version information
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
165
# Parse command
# The first argument is taken as the command,
# unless it looks like an option (i.e. starts with a dash)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Remember the remaining arguments as they were passed,
# before option parsing consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100173
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();

# Path to an optional configuration file
my $cfg_file;

# Handlers for flags that are still accepted, but have no effect
my $on_primary = sub {
  warn 'Primary flag no longer supported!';
};
my $on_pretty = sub {
  warn 'Pretty flag no longer supported!';
};

# Print the complete usage information
my $on_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version information only
my $on_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line.
# Scalar options are written directly to the configuration hash.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'gzip|z'                   => \($cfg{gzip}),
  'temporary-extract|te=s'   => \($cfg{extract_dir}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \($cfg{cache_file}),
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \($cfg{log}),
  'anno|a=s'                 => \@anno,
  'primary|p!'               => $on_primary,
  'pretty|y'                 => $on_pretty,
  'jobs|j=i'                 => \($cfg{jobs}),
  'koral|k=f'                => \($cfg{koral}),
  'to-tar'                   => \($cfg{to_tar}),
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),
  'help|h'                   => $on_help,
  'version|v'                => $on_version
);

# Parameters for pod2usage in case of invalid calls:
# Print the usage information and exit with an error code
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200237
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Scalar values: Keys are written with dashes in the configuration
  # file and with underscores in the configuration hash.
  # Values that are already defined are not overwritten.
  my @scalar_keys = qw!output cache-size input-base token overwrite
                       meta base-sentences base-paragraphs base-pagebreaks
                       gzip to-tar log cache non-word-tokens
                       non-verbal-tokens sequential-extraction cache-init
                       koral extract-dir jobs!;

  foreach my $key (@scalar_keys) {
    next unless defined $config{$key};

    (my $underlined = $key) =~ tr/-/_/;
    $cfg{$underlined} //= $config{$key};
  };

  # List values (skip, sigle, anno): Given as semicolon separated
  # strings, and only used if the list was not filled before
  my %list_for = (
    skip  => \@skip,
    sigle => \@sigle,
    anno  => \@anno
  );

  foreach my $key (qw!skip sigle anno!) {
    my $list = $list_for{$key};
    next if scalar(@$list) || !defined $config{$key};

    @$list = split /\s*;\s*/, $config{$key};
  };
};
272
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option is stored as 'cache_file', while the
# key 'cache' of the configuration file is stored as 'cache'.
# Both need to be respected - the command line takes precedence.
my $cache_file  = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs            = $cfg{jobs} // 0;
my $cache_delete    = $cfg{cache_delete} // 1;
my $base_sentences  = lc($cfg{base_sentences} // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of "Foundry#Layer".
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined. An empty token base results in
# both values being undefined.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension (if a layer is given at all)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct,
# i.e. "Corpus_Doc.Text" becomes "Corpus/Doc/Text"
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table for everything to skip (case insensitive)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200299
# Log to STDERR - only errors are reported,
# unless another log level is requested
Log::Any::Adapter->set(
  'Stderr',
  log_level => uc($cfg{log} // 'ERROR')
);

# If a command is given along with an output, the output has to be
# an existing directory. The check is omitted when to-tar is defined.
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Akronc11f7982017-02-21 21:20:14 +0100315
# Start serial processing
# Every input is processed by a separate run of the archive command,
# writing to an output directory named after the input.
if ($cmd && $cmd eq 'serial') {

  # Input and output are set for each run individually,
  # so they are removed from the arguments passed on
  @keep_argv = _strip_io_args(@keep_argv);

  # Number of archive runs that failed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving - the list form of system bypasses the shell.
    # Failures are reported, but the remaining inputs are still processed.
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $in failed " .
          ($? == -1 ? "to start: $!" : "with status $?")
      );
      $failed++;
    };
  };

  # Signal failure, if at least one run was not successful
  exit($failed ? 1 : 0);
};


# Remove the input and output options, along with their values,
# from a list of command line arguments and return the remaining
# arguments. Supports both the separated form ("-i file") and the
# joined form ("--input=file").
sub _strip_io_args {
  my @args = @_;

  my @kept;
  my $remove_next = 0;

  foreach my $arg (@args) {

    # Input or output flag
    if ($arg =~ /\A--?(?:input|i|output|o)(=.*)?\z/s) {

      # In the separated form the value is the next argument
      $remove_next = defined $1 ? 0 : 1;
      next;
    };

    # Value of an input or output flag
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @kept, $arg;
  };

  return @kept;
};
365
# Attributes of the DeReKo structure layer, depending on whether
# sentences, paragraphs or pagebreaks are based on DeReKo#Structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# Define supported (and preinstalled) transformation modules.
# Each entry consists of a foundry, a layer, and an optional attribute.
my @layers = (

  # Base - only if no other base is defined
  ($base_sentences  ? () : (['Base', 'Sentences'])),
  ($base_paragraphs ? () : (['Base', 'Paragraphs'])),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo
  (@dereko_attr
     ? (['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)])
     : (['DeReKo', 'Structure'])),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? (['DGD', 'Structure', 'base-sentence'])
     : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Gingko
  ['Gingko', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure'
     ? (['RWK', 'Structure'])
     : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
Akron4fa37c32017-01-20 14:43:10 +0100483
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations requested
# explicitly, given in the form of "Foundry#Layer"
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
502
Akrone1dbc382016-07-08 22:24:52 +0200503
# TODO: This should not be initialized for batch
# Parameters of the cache, stored in the cache file
my @cache_param = (
  share_file => $cache_file,
  cache_size => ($cfg{cache_size} // '50m'),
  init_file  => ($cfg{cache_init} // 1)
);
my $cache = Cache::FastMmap->new(@cache_param);

# Parameters of the batch object - values not given
# by the configuration fall back to defaults
my @batch_param = (
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => ($cfg{koral} // $KORAL_VERSION),
  anno              => \@filtered_anno,
  non_word_tokens   => ($cfg{non_word_tokens} // 0),
  non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
525
Akrone512b7c2020-08-07 16:16:12 +0200526
# Auto adjust jobs
# A value of -1 requests five jobs per core
if ($jobs eq '-1') {

  # Sys::Info is loaded on demand, as it may not be installed
  my $sys_info_loaded =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  # Assume a single core, if the number can't be determined
  my $cores = $sys_info_loaded ? Sys::Info->new->device('CPU')->count : 1;
  $log->warn("Unable to determine number of cores") unless $sys_info_loaded;

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
540
541
# Glob and prefix files
if (@input > 0) {

  # Expand all inputs, optionally prefixed with the input root.
  # The expansion works on copies, so the elements of @input
  # are not altered while iterating.
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Wildcard patterns may match no file at all - stop with a clear
  # message instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
561
562
# Process a single file
unless ($cmd) {

  # Start both clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the
  # start of the script, and reset the clock of the last stop
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last_stop . ' (overall: ' . $overall . ')'
    );

    $main::LAST_STOP = $now;
  };

  # The first input is the document to process -
  # make sure the path ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/}o;

  # Create, parse and process the document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
594
Nils Diewald59094f22014-11-05 18:20:50 +0000595
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object, in case the primary input is a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
640
Akron81500102017-04-07 20:45:44 +0200641
Akron941c1a62016-02-23 17:41:41 +0100642# Process an archive
643elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000644
Akron81500102017-04-07 20:45:44 +0200645 my $archive_output;
646
647 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100648 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200649
650 # Create new archive object
651 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
652
653 # Check zip capabilities
654 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200655 $log->error("Unzip is not installed or incompatible.");
656 exit 1;
Akron81500102017-04-07 20:45:44 +0200657 };
658
659 # Add further annotation archived
660 $archive->attach($_) foreach @input[1..$#input];
661
662 # Create a temporary directory
663 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200664 $extract_dir = tempdir(CLEANUP => 0);
665 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200666 };
667
Akron63f20d42017-04-10 23:40:29 +0200668 # Add some random extra to avoid clashes with multiple archives
669 $extract_dir = catdir($extract_dir, random_string('cccccc'));
670
Akron31a08cb2019-02-20 20:43:26 +0100671 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200672 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200673 @input = ($extract_dir);
674 }
675 else {
676 $log->error('Unable to extract from primary archive ' . $input[0] .
677 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200678 exit 1;
Akron81500102017-04-07 20:45:44 +0200679 };
680 }
681
682 # Can't create archive object
683 else {
684 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200685 exit 1;
Akron81500102017-04-07 20:45:44 +0200686 };
687 };
688
Akron7d4cdd82016-08-17 21:39:45 +0200689 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100690 my $pool = Parallel::ForkManager->new($jobs);
691
Akron7d4cdd82016-08-17 21:39:45 +0200692 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100693 my $iter = 1; # Current text in process
694
Akronda3097e2017-04-23 19:53:57 +0200695 my $tar_archive;
696 my $output_dir = $output;
697 my $tar_fh;
698
699 # Initialize tar archive
700 if ($to_tar) {
701 $tar_archive = Archive::Tar::Builder->new(
702 ignore_errors => 1
703 );
704
705 # Set output name
706 my $tar_file = $output;
707 unless ($tar_file =~ /\.tar$/) {
708 $tar_file .= '.tar';
709 };
710
711 # Initiate the tar file
712 print "Writing to file $tar_file\n";
713 $tar_fh = IO::File->new($tar_file, 'w');
714 $tar_fh->binmode(1);
715
716 # Set handle
717 $tar_archive->set_handle($tar_fh);
718
719 # Output to temporary directory
720 $output_dir = File::Temp->newdir;
721 };
722
Akron941c1a62016-02-23 17:41:41 +0100723 # Report on fork message
724 $pool->run_on_finish (
725 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200726 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100727 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200728
Akron08385f62016-03-22 20:37:04 +0100729 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200730 ($iter++) . "/$count]" .
731 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200732 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200733
734 if (!$code && $to_tar && $data->[2]) {
735 my $filename = $data->[2];
736
737 # Lock filehandle
738 if (flock($tar_fh, LOCK_EX)) {
739
Akron9a062ce2017-07-04 19:12:05 +0200740 my $clean_file = fileparse($filename);
741
Akronda3097e2017-04-23 19:53:57 +0200742 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200743 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200744 unlink $filename;
745
746 # Unlock filehandle
747 flock($tar_fh, LOCK_UN);
748 }
749 else {
750 $log->warn("Unable to add $filename to archive");
751 };
752 };
753
Akron4c0cf312016-10-15 16:42:09 +0200754 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100755 }
756 );
757
758 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200759 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100760 print "Reading data ...\n";
761
Akron7d4cdd82016-08-17 21:39:45 +0200762 # unless (Cache::FastMmap->new(
763 # share_file => $cache_file,
764 # cache_size => $cache_size,
765 # init_file => $cache_init
766 # )) {
767 # print "Unable to intialize cache '$cache_file'\n\n";
768 # exit(1);
769 # };
Akron11c80302016-03-18 19:44:43 +0100770
Akron486f9ab2017-04-22 23:25:19 +0200771
Akron941c1a62016-02-23 17:41:41 +0100772 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100773 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200774 # TODO:
775 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100776 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100777 my @dirs;
778 my $dir;
779
Akron7d4cdd82016-08-17 21:39:45 +0200780 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100781 while (1) {
782 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200783 push @dirs, $dir;
784 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100785 };
786 last unless $it->next;
787 };
788
789 print "Start processing ...\n";
790 $t = Benchmark->new;
791 $count = scalar @dirs;
792
793 DIRECTORY_LOOP:
794 for (my $i = 0; $i < $count; $i++) {
795
Akrone1dbc382016-07-08 22:24:52 +0200796 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200797 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200798 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200799 );
Akron941c1a62016-02-23 17:41:41 +0100800
801 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200802 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200803
Akron13d56622016-10-31 14:54:49 +0100804 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200805 $pool->finish(
806 0,
Akronda3097e2017-04-23 19:53:57 +0200807 [
808 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
809 undef,
810 $filename
811 ]
Akron486f9ab2017-04-22 23:25:19 +0200812 );
Akron3ec48972016-08-17 23:24:52 +0200813 }
814 else {
Akron4c0cf312016-10-15 16:42:09 +0200815 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200816 };
Akron941c1a62016-02-23 17:41:41 +0100817 };
818 }
819
820 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200821 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200822
Akron941c1a62016-02-23 17:41:41 +0100823 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200824 $log->error("Unzip is not installed or incompatible.");
825 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100826 };
827
Akron08385f62016-03-22 20:37:04 +0100828 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200829 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100830
Akron31a08cb2019-02-20 20:43:26 +0100831 # Get sigles to extract
832 my $prefix = set_sigle($archive);
833
Akron941c1a62016-02-23 17:41:41 +0100834 print "Start processing ...\n";
835 $t = Benchmark->new;
836 my @dirs = $archive->list_texts;
837 $count = scalar @dirs;
838
839 ARCHIVE_LOOP:
840 for (my $i = 0; $i < $count; $i++) {
841
842 # Split path information
843 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
844
Akrone1dbc382016-07-08 22:24:52 +0200845 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200846 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200847 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200848 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200849 catfile($corpus, $doc, $text)
850 . '.json' . ($gzip ? '.gz' : '')
851 )
Akrone1dbc382016-07-08 22:24:52 +0200852 );
Akron941c1a62016-02-23 17:41:41 +0100853
854 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200855 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100856
Akron4c0cf312016-10-15 16:42:09 +0200857 # Create temporary file
858 $temp = File::Temp->newdir;
859
Akronbdf434a2016-10-24 17:42:07 +0200860 # TODO: Check if $filename exist at the beginning,
861 # because extraction can be horrible slow!
862
Akron941c1a62016-02-23 17:41:41 +0100863 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100864 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100865
Akron7d4cdd82016-08-17 21:39:45 +0200866 # Create corpus directory
867 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100868
Akron7d4cdd82016-08-17 21:39:45 +0200869 # Temporary directory
870 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100871
Akron7d4cdd82016-08-17 21:39:45 +0200872 # Write file
Akron13d56622016-10-31 14:54:49 +0100873 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200874
Akron4c0cf312016-10-15 16:42:09 +0200875 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100876 $pool->finish(
877 0,
Akronda3097e2017-04-23 19:53:57 +0200878 [
879 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
880 $temp,
881 $filename
882 ]
Akron13d56622016-10-31 14:54:49 +0100883 );
Akron7d4cdd82016-08-17 21:39:45 +0200884 }
885 else {
Akron4c0cf312016-10-15 16:42:09 +0200886 # Delete temporary file
887 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200888 };
Akron941c1a62016-02-23 17:41:41 +0100889 }
Akron7d4cdd82016-08-17 21:39:45 +0200890
891 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100892 else {
Akron4c0cf312016-10-15 16:42:09 +0200893 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100894 };
895 };
896 }
897
898 else {
899 print "Input is neither a directory nor an archive.\n\n";
900 };
901
902 $pool->wait_all_children;
903
Akron11c80302016-03-18 19:44:43 +0100904 # Delete cache file
905 unlink($cache_file) if $cache_delete;
906
Akronda3097e2017-04-23 19:53:57 +0200907 # Close tar filehandle
908 if ($to_tar && $tar_fh) {
909 $tar_archive->finish;
910 $tar_fh->close;
911 print "Wrote to tar archive.\n";
912 };
913
Akron63f20d42017-04-10 23:40:29 +0200914 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100915 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200916};
Akron941c1a62016-02-23 17:41:41 +0100917
Nils Diewald2db9ad02013-10-29 19:26:43 +0000918
# For an archive, this will create the list
# of all sigles to process.
#
# Parameters:
#   $archive - the archive object to take the texts from
#              (needs to support list_texts, split_path,
#              check_prefix and extract_sigle)
#
# Side effects:
#   - Rewrites the @sigle list of the enclosing scope:
#     Without given sigles it is filled with one
#     'corpus/doc/text' entry per text listed by the archive,
#     otherwise only the text sigles are kept.
#   - Given doc sigles are extracted to $output immediately,
#     with progress information printed to STDOUT.
#
# Returns the prefix value: The first value returned by
# split_path() for the last listed text, or the result of
# check_prefix() if sigles were given, or 1 if none of
# these methods was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check if a prefix is needed. This is done only once
  # (for the first sigle), no matter of which kind it is.
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Sigles were given - extract doc sigles directly
  # and keep the text sigles for later processing
  my @text_sigles;

  # Iterate over all sigle, using a named variable, so
  # called methods can't clobber the list by changing $_
  foreach my $current (@sigle) {

    # Sigle is a text sigle
    unless ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @text_sigles, $current;

      # NOTE(review): In case the first sigle is a text sigle,
      # " with prefix ..." is printed without the sigle in front
      # and without a line break - kept as is, confirm if intended.
      $check_prefix_once->();
      next;
    };

    # Sigle is a doc sigle (corpus and document only,
    # optionally with a leading "./")
    print "$current ...";
    $check_prefix_once->();
    print "\n";

    # Extraction is limited to a single job,
    # if sequential extraction is requested
    my $extracted = $archive->extract_sigle(
      [$current], $output, $sequential_extraction ? 1 : $jobs
    );

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  @sigle = @text_sigles;

  return $prefix;
};
992
993
# Remove the temporary extraction directory, in case
# one was used, and log the number of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info("Removed directory $extract_dir with $removed objects");
};


# Finish the output with an empty line
print "\n";
1002
Nils Diewald2db9ad02013-10-29 19:26:43 +00001003__END__
Akron941c1a62016-02-23 17:41:41 +01001004
1005=pod
1006
1007=encoding utf8
1008
1009=head1 NAME
1010
Akron42f48c12020-02-14 13:08:13 +01001011korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001012
1013
1014=head1 SYNOPSIS
1015
Akrona76d8352016-10-27 16:27:32 +02001016 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001017
Akron2fd402b2016-10-27 21:26:48 +02001018
Akron941c1a62016-02-23 17:41:41 +01001019=head1 DESCRIPTION
1020
1021L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1022compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001023The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001024
1025
1026=head1 INSTALLATION
1027
1028The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1029
Akronaf386982016-10-12 00:33:25 +02001030 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001031
Akronc13a1702016-03-15 19:33:14 +01001032In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001033be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001034Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron0b04b312020-10-30 17:39:18 +01001035Optional support for L<Sys::Info> to calculate available cores.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001037
1038=head1 ARGUMENTS
1039
Akrona76d8352016-10-27 16:27:32 +02001040 $ korapxml2krill -z --input <directory> --output <filename>
1041
1042Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001043It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001044
Akron941c1a62016-02-23 17:41:41 +01001045=over 2
1046
1047=item B<archive>
1048
Akron081639e2017-04-21 19:01:39 +02001049 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001050
Akron2fd402b2016-10-27 21:26:48 +02001051Converts an archive of KorAP-XML documents. It expects a directory
1052(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001053
1054=item B<extract>
1055
Akrona76d8352016-10-27 16:27:32 +02001056 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1057
1058Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001059
Akron63f20d42017-04-10 23:40:29 +02001060=item B<serial>
1061
1062 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1063
1064Convert archives sequentially. The inputs are not merged but treated
1065as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001067are created based on the archive name. In case the C<--to-tar> flag is given,
1068the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001069
1070
Akron941c1a62016-02-23 17:41:41 +01001071=back
1072
1073
1074=head1 OPTIONS
1075
1076=over 2
1077
Akrona76d8352016-10-27 16:27:32 +02001078=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001079
Akrona76d8352016-10-27 16:27:32 +02001080Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001081
Akron7606afa2016-10-25 16:23:49 +02001082Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001083document, while C<archive> expects a KorAP-XML corpus folder or a zip
1084file to batch process multiple files.
1085C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001086
Akrona76d8352016-10-27 16:27:32 +02001087C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001088that the first archive listed contains all primary data files
1089and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001090
Akron7606afa2016-10-25 16:23:49 +02001091 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001092
Akron821db3d2017-04-06 21:19:31 +02001093Input may also be defined using BSD glob wildcards.
1094
1095 -i 'file/news*.zip'
1096
1097The extended input array will be sorted in length order, so the shortest
1098path needs to contain all primary data files and all meta data files.
1099
Akron0c3e3752016-06-28 15:55:53 +02001100(The directory structure follows the base directory format,
1101that may include a C<.> root folder.
1102In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001103need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001105
Akron7606afa2016-10-25 16:23:49 +02001106To support zip files, a version of C<unzip> needs to be installed that is
1107compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001108
Akron7606afa2016-10-25 16:23:49 +02001109B<The root folder switch using the hash sign is experimental and
1110may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001111
Akronf73ffb62018-06-27 12:13:59 +02001112
Akron63f20d42017-04-10 23:40:29 +02001113=item B<--input-base|-ib> <directory>
1114
1115The base directory for inputs.
1116
1117
Akron941c1a62016-02-23 17:41:41 +01001118=item B<--output|-o> <directory|file>
1119
1120Output folder for archive processing or
1121document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001122writes to C<STDOUT> by default
1123(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001124
1125=item B<--overwrite|-w>
1126
1127Overwrite files that already exist.
1128
Akronf73ffb62018-06-27 12:13:59 +02001129
Akron3741f8b2016-12-21 19:55:21 +01001130=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001131
1132Define the default tokenization by specifying
1133the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001134of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001135This will directly take the file instead of running
1136the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001137
Akron3741f8b2016-12-21 19:55:21 +01001138
1139=item B<--base-sentences|-bs> <foundry>#<layer>
1140
1141Define the layer for base sentences.
1142If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001143Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1144layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001145
1146 Defaults to unset.
1147
1148
1149=item B<--base-paragraphs|-bp> <foundry>#<layer>
1150
1151Define the layer for base paragraphs.
1152If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001153Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001155
1156 Defaults to unset.
1157
1158
Akron41ac10b2017-02-08 22:47:25 +01001159=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1160
1161Define the layer for base pagebreaks.
1162Currently C<DeReKo#Structure> is the only layer supported.
1163
1164 Defaults to unset.
1165
1166
Akron941c1a62016-02-23 17:41:41 +01001167=item B<--skip|-s> <foundry>[#<layer>]
1168
Akronf7ad89e2016-03-16 18:22:47 +01001169Skip specific annotations by specifying the foundry
1170(and optionally the layer with a C<#>-prefix),
1171e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001172Can be set multiple times.
1173
Akronf73ffb62018-06-27 12:13:59 +02001174
Akronc13a1702016-03-15 19:33:14 +01001175=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001176
Akronf7ad89e2016-03-16 18:22:47 +01001177Convert specific annotations by specifying the foundry
1178(and optionally the layer with a C<#>-prefix),
1179e.g. C<Mate> or C<Mate#Morpho>.
1180Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001181
Akronf73ffb62018-06-27 12:13:59 +02001182
Akroned9baf02019-01-22 17:03:25 +01001183=item B<--non-word-tokens|-nwt>
1184
1185Tokenize non-word tokens like word tokens (defined as matching
1186C</[\d\w]/>). Useful to treat punctuations as tokens.
1187
1188 Defaults to unset.
1189
Akronf1849aa2019-12-16 23:35:33 +01001190
1191=item B<--non-verbal-tokens|-nvt>
1192
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1195
1196 Defaults to unset.
1197
1198
Akron941c1a62016-02-23 17:41:41 +01001199=item B<--jobs|-j>
1200
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001202for archive processing.
Akron11c80302016-03-18 19:44:43 +01001203Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001204
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1207
Akronc11f7982017-02-21 21:20:14 +01001208Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001209times the number of available cores, in case L<Sys::Info>
1210is available.
Akronf7ad89e2016-03-16 18:22:47 +01001211This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001212
Akronf73ffb62018-06-27 12:13:59 +02001213
Akron263274c2019-02-07 09:48:30 +01001214=item B<--koral|-k>
1215
1216Version of the output format. Supported versions are:
1217C<0> for legacy serialization, C<0.03> for serialization
1218with metadata fields as key-values on the root object,
1219C<0.4> for serialization with metadata fields as a list
1220of C<"@type":"koral:field"> objects.
1221
1222Currently defaults to C<0.03>.
1223
1224
Akron9ec88872017-04-12 16:29:06 +02001225=item B<--sequential-extraction|-se>
1226
Flag to indicate that extraction runs sequentially, i.e. that the
C<jobs> value is not applied to extraction.
1228Some systems may have problems with extracting multiple archives
1229to the same folder at the same time.
1230Can be flagged using C<--no-sequential-extraction> as well.
1231Defaults to C<false>.
1232
Akronf73ffb62018-06-27 12:13:59 +02001233
Akron35db6e32016-03-17 22:42:22 +01001234=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001235
Akron35db6e32016-03-17 22:42:22 +01001236Define the metadata parser to use. Defaults to C<I5>.
1237Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1238This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001239
Akronf73ffb62018-06-27 12:13:59 +02001240
Akron941c1a62016-02-23 17:41:41 +01001241=item B<--gzip|-z>
1242
Akronf7ad89e2016-03-16 18:22:47 +01001243Compress the output.
1244Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001245
Akronf73ffb62018-06-27 12:13:59 +02001246
Akron11c80302016-03-18 19:44:43 +01001247=item B<--cache|-c>
1248
1249File to mmap a cache (using L<Cache::FastMmap>).
1250Defaults to C<korapxml2krill.cache> in the calling directory.
1251
Akronf73ffb62018-06-27 12:13:59 +02001252
Akron11c80302016-03-18 19:44:43 +01001253=item B<--cache-size|-cs>
1254
1255Size of the cache. Defaults to C<50m>.
1256
Akronf73ffb62018-06-27 12:13:59 +02001257
Akron11c80302016-03-18 19:44:43 +01001258=item B<--cache-init|-ci>
1259
1260Initialize cache file.
1261Can be flagged using C<--no-cache-init> as well.
1262Defaults to C<true>.
1263
Akronf73ffb62018-06-27 12:13:59 +02001264
Akron11c80302016-03-18 19:44:43 +01001265=item B<--cache-delete|-cd>
1266
1267Delete cache file after processing.
1268Can be flagged using C<--no-cache-delete> as well.
1269Defaults to C<true>.
1270
Akronf73ffb62018-06-27 12:13:59 +02001271
Akron636aa112017-04-07 18:48:56 +02001272=item B<--config|-cfg>
1273
1274Configure the parameters of your call in a file
1275of key-value pairs with whitespace separator
1276
1277 overwrite 1
1278 token DeReKo#Structure
1279 ...
1280
1281Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001282C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001283C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001284C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001286C<base-sentences>, C<base-paragraphs>,
1287C<base-pagebreaks>,
1288C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001289(semicolon separated), C<anno> (semicolon separated).
1290
Akronf73ffb62018-06-27 12:13:59 +02001291Configuration parameters will always be overwritten by
1292passed parameters.
1293
1294
Akron81500102017-04-07 20:45:44 +02001295=item B<--temporary-extract|-te>
1296
1297Only valid for the C<archive> command.
1298
1299This will first extract all files into a
1300directory and then will archive.
1301If the directory is given as C<:temp:>,
1302a temporary directory is used.
1303This is especially useful to avoid
1304massive unzipping and potential
1305network latency.
Akron636aa112017-04-07 18:48:56 +02001306
Akronf73ffb62018-06-27 12:13:59 +02001307
Akronc93a0802019-07-11 15:48:34 +02001308=item B<--to-tar>
1309
1310Only valid for the C<archive> command.
1311
1312Writes the output into a tar archive.
1313
1314
Akrone10ad322016-02-27 10:54:26 +01001315=item B<--sigle|-sg>
1316
Akron20807582016-10-26 17:11:34 +02001317Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001318Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001319I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001320Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001321In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001322On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001323
Akronf73ffb62018-06-27 12:13:59 +02001324
Akron941c1a62016-02-23 17:41:41 +01001325=item B<--log|-l>
1326
Akronb9c33812020-10-21 16:19:35 +02001327The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001328
Akronf73ffb62018-06-27 12:13:59 +02001329
Akron941c1a62016-02-23 17:41:41 +01001330=item B<--help|-h>
1331
Akron42f48c12020-02-14 13:08:13 +01001332Print help information.
Akron941c1a62016-02-23 17:41:41 +01001333
Akronf73ffb62018-06-27 12:13:59 +02001334
Akron941c1a62016-02-23 17:41:41 +01001335=item B<--version|-v>
1336
1337Print version information.
1338
1339=back
1340
Akronf73ffb62018-06-27 12:13:59 +02001341
Akronc13a1702016-03-15 19:33:14 +01001342=head1 ANNOTATION SUPPORT
1343
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
1347L<Krill|https://github.com/KorAP/Krill>.
1348
Akron821db3d2017-04-06 21:19:31 +02001349 Base
1350 #Paragraphs
1351 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001352
Akron821db3d2017-04-06 21:19:31 +02001353 Connexor
1354 #Morpho
1355 #Phrase
1356 #Sentences
1357 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001358
Akron821db3d2017-04-06 21:19:31 +02001359 CoreNLP
1360 #Constituency
1361 #Morpho
1362 #NamedEntities
1363 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001364
Akronce125b62017-06-19 11:54:36 +02001365 CMC
1366 #Morpho
1367
Akron821db3d2017-04-06 21:19:31 +02001368 DeReKo
1369 #Structure
Akronc13a1702016-03-15 19:33:14 +01001370
Akron57510c12019-01-04 14:58:53 +01001371 DGD
1372 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001373 #Structure
Akron57510c12019-01-04 14:58:53 +01001374
Akron821db3d2017-04-06 21:19:31 +02001375 DRuKoLa
1376 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001377
Akron821db3d2017-04-06 21:19:31 +02001378 Glemm
1379 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001380
Akronabb36902021-10-11 15:51:06 +02001381 Gingko
1382 #Morpho
1383
Akronea1aed52018-07-19 14:43:34 +02001384 HNC
1385 #Morpho
1386
Akron4c679192018-01-16 17:41:49 +01001387 LWC
1388 #Dependency
1389
Akron821db3d2017-04-06 21:19:31 +02001390 Malt
1391 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001392
Akron821db3d2017-04-06 21:19:31 +02001393 MarMoT
1394 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001395
Akron821db3d2017-04-06 21:19:31 +02001396 Mate
1397 #Dependency
1398 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001399
Akron821db3d2017-04-06 21:19:31 +02001400 MDParser
1401 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001402
Akron821db3d2017-04-06 21:19:31 +02001403 OpenNLP
1404 #Morpho
1405 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001406
Akron07e24772020-04-23 14:00:54 +02001407 RWK
1408 #Morpho
1409 #Structure
1410
Akron821db3d2017-04-06 21:19:31 +02001411 Sgbr
1412 #Lemma
1413 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001414
Akron7d5e6382019-08-08 16:36:27 +02001415 Talismane
1416 #Dependency
1417 #Morpho
1418
Akron821db3d2017-04-06 21:19:31 +02001419 TreeTagger
1420 #Morpho
1421 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001422
Akron821db3d2017-04-06 21:19:31 +02001423 XIP
1424 #Constituency
1425 #Morpho
1426 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001427
Akronc13a1702016-03-15 19:33:14 +01001428
1429More importers are in preparation.
1430New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1431See the built-in annotation importers as examples.
1432
Akronf73ffb62018-06-27 12:13:59 +02001433
Akron41e6c8b2021-10-14 20:22:18 +02001434=head1 METADATA SUPPORT
1435
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1437developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1438
1439=over 2
1440
1441=item I5 - Meta data for all I5 files
1442
1443=item Sgbr - Meta data from the Schreibgebrauch project
1444
1445=item Gingko - Meta data from the Gingko project in addition to I5
1446
1447=back
1448
1449More importers are in preparation.
1450New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1451See the built-in meta data importers as examples.
1452
1453
Akron8f69d632020-01-15 16:58:11 +01001454=head1 About KorAP-XML
1455
1456KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1457data model (Bański et al. 2013), where text data are stored physically
1458separated from their interpretations (i.e. annotations).
1459A text document in KorAP-XML therefore consists of several files
1460containing primary data, metadata and annotations.
1461
1462The structure of a single KorAP-XML document can be as follows:
1463
1464 - data.xml
1465 - header.xml
1466 + base
1467 - tokens.xml
1468 - ...
1469 + struct
1470 - structure.xml
1471 - ...
1472 + corenlp
1473 - morpho.xml
1474 - constituency.xml
1475 - ...
1476 + tree_tagger
1477 - morpho.xml
1478 - ...
1479 - ...
1480
1481The C<data.xml> contains the primary data, the C<header.xml> contains
1482the metadata, and the annotation layers are stored in subfolders
1483like C<base>, C<struct> or C<corenlp>
1484(so-called "foundries"; Bański et al. 2013).
1485
1486Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001487(Lüngen and Sperberg-McQueen 2012). See the documentation in
1488L<KorAP::XML::Meta::I5> for translatable fields.
1489
1490Annotations correspond to a variant of the TEI-P5 feature structures
1491(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001492Annotation feature structures refer to character sequences of the primary text
1493inside the C<text> element of the C<data.xml>.
1494A single annotation containing the lemma of a token can have the following structure:
1495
1496 <span from="0" to="3">
1497 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1498 <f name="lex">
1499 <fs>
1500 <f name="lemma">zum</f>
1501 </fs>
1502 </f>
1503 </fs>
1504 </span>
1505
The C<from> and C<to> attributes refer to the character span
1507in the primary text.
1508Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1509the structure may vary. See L<KorAP::XML::Annotation::*> for various
1510annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001511
1512Multiple KorAP-XML documents are organized on three levels following
1513the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1514corpus E<gt> document E<gt> text. On each level metadata information
1515can be stored, that C<korapxml2krill> will merge to a single metadata
1516object per text. A corpus is therefore structured as follows:
1517
1518 + <corpus>
1519 - header.xml
1520 + <document>
1521 - header.xml
1522 + <text>
1523 - data.xml
1524 - header.xml
1525 - ...
1526 - ...
1527
1528A single text can be identified by the concatenation of
1529the corpus identifier, the document identifier and the text identifier.
1530This identifier is called the text sigle
1531(e.g. a text with the identifier C<18486> in the document C<060> in the
1532corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1533
These corpora are often stored in zip files, which C<korapxml2krill>
1535can deal with. Corpora may also be split in multiple zip archives
1536(e.g. one zip file per foundry), which is also supported (see C<--input>).
1537
1538Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1539in form of a test suite.
1540The resulting JSON format merges all annotation layers
1541based on a single token stream.
1542
1543=head2 References
1544
1545Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1546KorAP data model: first approximation, December.
1547
1548Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1549"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1550Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1551L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1552
1553Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1554"Robust corpus architecture: a new look at virtual collections and data access",
1555Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1556L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1557
1558Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1559Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1560"Towards an international standard on featurestructure representation",
1561Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1562pp. 373-376.
1563L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1564
1565Harald Lüngen and C. M. Sperberg-McQueen (2012):
1566"A TEI P5 Document Grammar for the IDS Text Model",
1567Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1568L<PDF|https://journals.openedition.org/jtei/pdf/508>
1569
1570TEI Consortium, eds:
1571"Feature Structures",
1572Guidelines for Electronic Text Encoding and Interchange.
1573L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1574
Akron941c1a62016-02-23 17:41:41 +01001575=head1 AVAILABILITY
1576
1577 https://github.com/KorAP/KorAP-XML-Krill
1578
1579
1580=head1 COPYRIGHT AND LICENSE
1581
Akron6882d7d2021-02-08 09:43:57 +01001582Copyright (C) 2015-2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001583
Akron6882d7d2021-02-08 09:43:57 +01001584Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001585
Akrona76d8352016-10-27 16:27:32 +02001586Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001587
Akron6882d7d2021-02-08 09:43:57 +01001588L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001589Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001590L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001591member of the
Akronf1849aa2019-12-16 23:35:33 +01001592L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001593
1594This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001595L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001596
1597=cut