blob: d09786c6fbcafa2593c2cf6155feb4a4deb0a4d1 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010022use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020023use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020024use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020025use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020026use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020027use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akron941c1a62016-02-23 17:41:41 +0100161# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100162
# Script-wide constants: date of the last change, location of the script,
# default Koral version and the banner printed by --help and --version
our $LAST_CHANGE   = '2022/01/17';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   = 'Version ' . $KorAP::XML::Krill::VERSION .
  ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";

# Parse command: The first argument is treated as the command,
# unless it starts with a dash (and is therefore an option)
our @ARGV;
my $cmd;
if ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') {
  $cmd = shift @ARGV;
};

# Keep a copy of the arguments that follow the command
my @keep_argv = @ARGV;

# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100182
# Path to an optional configuration file
my $cfg_file;

# Parse options from the command line.
# Scalar options are written to %cfg, repeatable options to arrays.
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},

  # Meta data, tokenization and base annotations
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},

  # Compression, extraction and filtering
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,

  # Cache, configuration and logging
  'cache|c=s'                => \$cfg{cache_file},
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Flags that are still accepted, but have no effect anymore
  'primary|p!'               => sub { warn 'Primary flag no longer supported!' },
  'pretty|y'                 => sub { warn 'Pretty flag no longer supported!' },

  # Processing
  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the documentation
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);

# Parameters for pod2usage in case of invalid calls:
# print the documentation and exit with an error code
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200241
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: Values from the command line take precedence,
  # the configuration only fills values that are not yet defined.
  # Dashes in configuration keys are mapped to underscores.
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init
                      koral extract-dir jobs!) {
    my $underlined = $key =~ tr/-/_/r;
    next if defined $cfg{$underlined} || !defined $config{$key};
    $cfg{$underlined} = $config{$key};
  };

  # List options (skip, sigle, anno): Given in the configuration as
  # semicolon separated strings and only used, if the command line
  # did not pass any value for the option
  my %list_for = (
    skip  => \@skip,
    sigle => \@sigle,
    anno  => \@anno
  );
  foreach my $key (qw!skip sigle anno!) {
    my $list = $list_for{$key};
    next if scalar(@$list) || !defined $config{$key};
    @$list = split /\s*;\s*/, $config{$key};
  };
}

# A configuration file was requested, but does not exist -
# report this instead of silently falling back to the defaults
elsif ($cfg_file) {
  warn "Configuration file '$cfg_file' can't be found\n";
};
277
# Init variables and set default values
my $output     = $cfg{output};
my $input_base = $cfg{input_base};
my $gzip       = $cfg{gzip};
my $to_tar     = $cfg{to_tar};

# The command line option --temporary-extract is stored as 'extract_dir',
# while the configuration key 'temporary-extract' is stored as
# 'temporary_extract' - respect both (command line first)
my $extract_dir = $cfg{extract_dir} // $cfg{temporary_extract};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored as 'cache_file',
# while the configuration key 'cache' is stored as 'cache' -
# respect both (command line first)
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is separated from the conditional assignment,
# as a 'my' with a statement modifier has undefined behaviour.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension (there is no layer, if no '#' was given)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of lowercased foundries and layers to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200304
# Log to STDERR using the requested log level (defaults to ERROR)
Log::Any::Adapter->set(
  'Stderr', log_level => uc($cfg{log} // 'ERROR')
);

# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first argument left after option parsing
  my $log_file = shift @ARGV;

  # The log file is missing or was not passed at all
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  my $slimmer = KorAP::XML::Log::Slim->new($log_file);

  # Run log filter
  $slimmer->slim_to;

  exit;
};
330
331
# When a command is given and the output is not written to a tar file,
# the output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !(-d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000342
# Start serial processing:
# Each input is processed by a separate 'archive' call of this script
if ($cmd && $cmd eq 'serial') {

  # Remove all input and output parameters from the kept arguments,
  # as they are set individually for each archive call below
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - the value either follows as the next
    # argument or is joined to the flag with '=' (e.g. --input=a.zip)
    if (/\A--?(?:input|i|output|o)(=.*)?\z/s) {
      $remove_next = defined $1 ? 0 : 1;
      $keep = 0;
    }

    # Value of a preceding input or output flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command - passed as a list to bypass the shell
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - report failures, but continue with the next input
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed (status $?)");
    };
  };

  exit;
};
392
# Define supported (and preinstalled) transformation modules.
# Each entry has the form [Foundry, Layer] with an optional
# third element as a parameter. The order of the entries is kept.
my @layers = ();

# Base annotations are only added, if no other
# base sentences or base paragraphs are defined
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all base annotations taken from the DeReKo structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# Pass the base annotations as a parameter, if there are any
push(
  @layers,
  scalar(@dereko_attr)
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Gingko
push @layers, ['Gingko', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
Akrone1dbc382016-07-08 22:24:52 +0200509
Akron4fa37c32017-01-20 14:43:10 +0100510
# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly
# requested annotations (given as 'Foundry#Layer')
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Keep a layer, unless its Foundry or Foundry#Layer should be skipped
  @filtered_anno = grep {
    my ($foundry, $layer) = (lc($_->[0]), lc($_->[1]));
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
529
Akrone1dbc382016-07-08 22:24:52 +0200530
# TODO: This should not be initialized for batch
# Create the cache object based on the cache file
my $cache = do {
  my $size = $cfg{cache_size} // '50m';
  my $init = $cfg{cache_init} // 1;

  Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $size,
    init_file  => $init
  );
};

# Create batch object
my $batch_file = do {

  # Options with default values
  my $koral_version     = $cfg{koral} // $KORAL_VERSION;
  my $non_word_tokens   = $cfg{non_word_tokens} // 0;
  my $non_verbal_tokens = $cfg{non_verbal_tokens} // 0;

  KorAP::XML::Batch::File->new(
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    gzip              => $gzip,
    log               => $log,
    koral             => $koral_version,
    anno              => \@filtered_anno,
    non_word_tokens   => $non_word_tokens,
    non_verbal_tokens => $non_verbal_tokens
  );
};
552
Akrone512b7c2020-08-07 16:16:12 +0200553
# Auto adjust jobs: A value of -1 means that the
# number of jobs is derived from the number of cores
if ($jobs eq '-1') {

  # Fall back to a single core, if the number can't be determined
  my $cores = 1;

  # Sys::Info is optional and therefore loaded at runtime
  my $sys_info_loaded =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  if (!$sys_info_loaded) {
    $log->warn("Unable to determine number of cores");
  }
  else {
    $cores = Sys::Info->new->device('CPU')->count;
  };

  # Run five jobs per core
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
567
568
# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    # Expand wildcards
    my @matches = bsd_glob($pattern);

    # Report patterns without any match instead of dropping them silently
    unless (@matches) {
      $log->warn("Input '$pattern' does not match any file");
    };

    push(@new_input, @matches);
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
588
589
# Process a single file (no command was given)
if (!$cmd) {

  # Take the start time as soon as the script is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and
  # since the start, and remember the new stop
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $since_start     = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_last_stop .
        ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Only the first input is respected
  my ($input) = @input;

  # Ensure the path ends with a slash
  $input =~ s{([^/])$}{$1/}o;

  # Create, parse and process the new document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
621
Nils Diewald59094f22014-11-05 18:20:50 +0000622
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  $output or pod2usage(%ERROR_HASH);

  # Create new archive object, in case the primary input is a file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archived
  foreach my $annotation_archive (@input[1..$#input]) {
    $archive->attach($annotation_archive);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    #   - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    # Report on the success of the extraction
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
667
Akron81500102017-04-07 20:45:44 +0200668
Akron941c1a62016-02-23 17:41:41 +0100669# Process an archive
670elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000671
Akron81500102017-04-07 20:45:44 +0200672 my $archive_output;
673
674 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100675 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200676
677 # Create new archive object
678 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
679
680 # Check zip capabilities
681 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200682 $log->error("Unzip is not installed or incompatible.");
683 exit 1;
Akron81500102017-04-07 20:45:44 +0200684 };
685
686 # Add further annotation archived
687 $archive->attach($_) foreach @input[1..$#input];
688
689 # Create a temporary directory
690 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200691 $extract_dir = tempdir(CLEANUP => 0);
692 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200693 };
694
Akron63f20d42017-04-10 23:40:29 +0200695 # Add some random extra to avoid clashes with multiple archives
696 $extract_dir = catdir($extract_dir, random_string('cccccc'));
697
Akron31a08cb2019-02-20 20:43:26 +0100698 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200699 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200700 @input = ($extract_dir);
701 }
702 else {
703 $log->error('Unable to extract from primary archive ' . $input[0] .
704 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200705 exit 1;
Akron81500102017-04-07 20:45:44 +0200706 };
707 }
708
709 # Can't create archive object
710 else {
711 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200712 exit 1;
Akron81500102017-04-07 20:45:44 +0200713 };
714 };
715
Akron7d4cdd82016-08-17 21:39:45 +0200716 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100717 my $pool = Parallel::ForkManager->new($jobs);
718
Akron7d4cdd82016-08-17 21:39:45 +0200719 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100720 my $iter = 1; # Current text in process
721
Akronda3097e2017-04-23 19:53:57 +0200722 my $tar_archive;
723 my $output_dir = $output;
724 my $tar_fh;
725
726 # Initialize tar archive
727 if ($to_tar) {
728 $tar_archive = Archive::Tar::Builder->new(
729 ignore_errors => 1
730 );
731
732 # Set output name
733 my $tar_file = $output;
734 unless ($tar_file =~ /\.tar$/) {
735 $tar_file .= '.tar';
736 };
737
738 # Initiate the tar file
739 print "Writing to file $tar_file\n";
740 $tar_fh = IO::File->new($tar_file, 'w');
741 $tar_fh->binmode(1);
742
743 # Set handle
744 $tar_archive->set_handle($tar_fh);
745
746 # Output to temporary directory
747 $output_dir = File::Temp->newdir;
748 };
749
Akron941c1a62016-02-23 17:41:41 +0100750 # Report on fork message
751 $pool->run_on_finish (
752 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200753 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100754 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200755
Akron08385f62016-03-22 20:37:04 +0100756 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200757 ($iter++) . "/$count]" .
758 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200759 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200760
761 if (!$code && $to_tar && $data->[2]) {
762 my $filename = $data->[2];
763
764 # Lock filehandle
765 if (flock($tar_fh, LOCK_EX)) {
766
Akron9a062ce2017-07-04 19:12:05 +0200767 my $clean_file = fileparse($filename);
768
Akronda3097e2017-04-23 19:53:57 +0200769 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200770 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200771 unlink $filename;
772
773 # Unlock filehandle
774 flock($tar_fh, LOCK_UN);
775 }
776 else {
777 $log->warn("Unable to add $filename to archive");
778 };
779 };
780
Akron4c0cf312016-10-15 16:42:09 +0200781 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100782 }
783 );
784
785 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200786 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100787 print "Reading data ...\n";
788
Akron7d4cdd82016-08-17 21:39:45 +0200789 # unless (Cache::FastMmap->new(
790 # share_file => $cache_file,
791 # cache_size => $cache_size,
792 # init_file => $cache_init
793 # )) {
794 # print "Unable to intialize cache '$cache_file'\n\n";
795 # exit(1);
796 # };
Akron11c80302016-03-18 19:44:43 +0100797
Akron486f9ab2017-04-22 23:25:19 +0200798
Akron941c1a62016-02-23 17:41:41 +0100799 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100800 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200801 # TODO:
802 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100803 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100804 my @dirs;
805 my $dir;
806
Akron7d4cdd82016-08-17 21:39:45 +0200807 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100808 while (1) {
809 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200810 push @dirs, $dir;
811 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100812 };
813 last unless $it->next;
814 };
815
816 print "Start processing ...\n";
817 $t = Benchmark->new;
818 $count = scalar @dirs;
819
820 DIRECTORY_LOOP:
821 for (my $i = 0; $i < $count; $i++) {
822
Akrone1dbc382016-07-08 22:24:52 +0200823 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200824 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200825 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200826 );
Akron941c1a62016-02-23 17:41:41 +0100827
828 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200829 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200830
Akron13d56622016-10-31 14:54:49 +0100831 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200832 $pool->finish(
833 0,
Akronda3097e2017-04-23 19:53:57 +0200834 [
835 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
836 undef,
837 $filename
838 ]
Akron486f9ab2017-04-22 23:25:19 +0200839 );
Akron3ec48972016-08-17 23:24:52 +0200840 }
841 else {
Akron4c0cf312016-10-15 16:42:09 +0200842 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200843 };
Akron941c1a62016-02-23 17:41:41 +0100844 };
845 }
846
847 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200848 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200849
Akron941c1a62016-02-23 17:41:41 +0100850 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200851 $log->error("Unzip is not installed or incompatible.");
852 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100853 };
854
Akron08385f62016-03-22 20:37:04 +0100855 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200856 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100857
Akron31a08cb2019-02-20 20:43:26 +0100858 # Get sigles to extract
859 my $prefix = set_sigle($archive);
860
Akron941c1a62016-02-23 17:41:41 +0100861 print "Start processing ...\n";
862 $t = Benchmark->new;
863 my @dirs = $archive->list_texts;
864 $count = scalar @dirs;
865
866 ARCHIVE_LOOP:
867 for (my $i = 0; $i < $count; $i++) {
868
869 # Split path information
870 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
871
Akrone1dbc382016-07-08 22:24:52 +0200872 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200873 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200874 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200875 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200876 catfile($corpus, $doc, $text)
877 . '.json' . ($gzip ? '.gz' : '')
878 )
Akrone1dbc382016-07-08 22:24:52 +0200879 );
Akron941c1a62016-02-23 17:41:41 +0100880
881 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200882 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100883
Akron4c0cf312016-10-15 16:42:09 +0200884 # Create temporary file
885 $temp = File::Temp->newdir;
886
Akronbdf434a2016-10-24 17:42:07 +0200887 # TODO: Check if $filename exist at the beginning,
888 # because extraction can be horrible slow!
889
Akron941c1a62016-02-23 17:41:41 +0100890 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100891 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100892
Akron7d4cdd82016-08-17 21:39:45 +0200893 # Create corpus directory
894 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100895
Akron7d4cdd82016-08-17 21:39:45 +0200896 # Temporary directory
897 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100898
Akron7d4cdd82016-08-17 21:39:45 +0200899 # Write file
Akron13d56622016-10-31 14:54:49 +0100900 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200901
Akron4c0cf312016-10-15 16:42:09 +0200902 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100903 $pool->finish(
904 0,
Akronda3097e2017-04-23 19:53:57 +0200905 [
906 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
907 $temp,
908 $filename
909 ]
Akron13d56622016-10-31 14:54:49 +0100910 );
Akron7d4cdd82016-08-17 21:39:45 +0200911 }
912 else {
Akron4c0cf312016-10-15 16:42:09 +0200913 # Delete temporary file
914 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200915 };
Akron941c1a62016-02-23 17:41:41 +0100916 }
Akron7d4cdd82016-08-17 21:39:45 +0200917
918 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100919 else {
Akron4c0cf312016-10-15 16:42:09 +0200920 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100921 };
922 };
923 }
924
925 else {
926 print "Input is neither a directory nor an archive.\n\n";
927 };
928
929 $pool->wait_all_children;
930
Akron11c80302016-03-18 19:44:43 +0100931 # Delete cache file
932 unlink($cache_file) if $cache_delete;
933
Akronda3097e2017-04-23 19:53:57 +0200934 # Close tar filehandle
935 if ($to_tar && $tar_fh) {
936 $tar_archive->finish;
937 $tar_fh->close;
938 print "Wrote to tar archive.\n";
939 };
940
Akron63f20d42017-04-10 23:40:29 +0200941 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100942 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200943};
Akron941c1a62016-02-23 17:41:41 +0100944
Nils Diewald2db9ad02013-10-29 19:26:43 +0000945
# Prepare the list of sigles to process for an archive.
#
# Works on the file-level @sigle list:
# - If @sigle is empty, it is filled with the sigles
#   (corpus/doc/text) of all texts listed by the archive.
# - Otherwise every entry that looks like a doc sigle (two path
#   segments, optionally preceded by "./" or ".\") is passed to
#   extract_sigle right away (with $output as second argument)
#   and dropped from @sigle, so that only the remaining entries
#   (treated as text sigles) are kept.
#
# Parameters:
#   $archive - archive object providing list_texts, split_path,
#              check_prefix and extract_sigle
#
# Returns the prefix value: the first element of the last
# split_path result, or the result of check_prefix.
# Returns 1 if neither method was called.
sub set_sigle {
  my $archive = shift;

  # Fallback value in case no prefix is determined below
  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    for my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  # The archive is checked for a prefix only once,
  # no matter which kind of sigle is seen first
  my $prefix_checked = 0;
  my $check_prefix = sub {
    return if $prefix_checked;
    $prefix_checked = 1;

    $prefix = $archive->check_prefix;
    print " with prefix ..." if $prefix;
    return;
  };

  # Entries that are kept in @sigle
  my @text_sigle;

  # Iterate over all sigles
  for my $entry (@sigle) {

    # Entry is a doc sigle - extract it right away
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      print "$entry ...";
      $check_prefix->();
      print "\n";

      my $extracted = $archive->extract_sigle(
        [$entry], $output, $sequential_extraction ? 1 : $jobs
      );
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    }

    # Entry is a text sigle - keep it
    else {
      push @text_sigle, $entry;
      $check_prefix->();
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
1019
1020
# Remove the temporary extraction directory, if one was set,
# and log the number of objects reported by remove_tree
if ($extract_dir) {
  my $objects = remove_tree($extract_dir, { safe => 1 });
  my $message = "Removed directory $extract_dir with $objects objects";
  $log->info($message);
};


# Finish the console output with an empty line
print "\n";
1029
Nils Diewald2db9ad02013-10-29 19:26:43 +00001030__END__
Akron941c1a62016-02-23 17:41:41 +01001031
1032=pod
1033
1034=encoding utf8
1035
1036=head1 NAME
1037
Akron42f48c12020-02-14 13:08:13 +01001038korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001039
1040
1041=head1 SYNOPSIS
1042
Akrona76d8352016-10-27 16:27:32 +02001043 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001044
Akron2fd402b2016-10-27 21:26:48 +02001045
Akron941c1a62016-02-23 17:41:41 +01001046=head1 DESCRIPTION
1047
1048L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1049compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001050The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001051
1052
1053=head1 INSTALLATION
1054
1055The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1056
Akronaf386982016-10-12 00:33:25 +02001057 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001058
Akronc13a1702016-03-15 19:33:14 +01001059In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001060be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001061Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron0b04b312020-10-30 17:39:18 +01001062Optional support for L<Sys::Info> to calculate available cores.
In addition, the C<unzip> tool needs to be present to work with zip archives.
Akron941c1a62016-02-23 17:41:41 +01001064
1065=head1 ARGUMENTS
1066
Akrona76d8352016-10-27 16:27:32 +02001067 $ korapxml2krill -z --input <directory> --output <filename>
1068
1069Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001070It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001071
Akron941c1a62016-02-23 17:41:41 +01001072=over 2
1073
1074=item B<archive>
1075
Akron081639e2017-04-21 19:01:39 +02001076 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001077
Akron2fd402b2016-10-27 21:26:48 +02001078Converts an archive of KorAP-XML documents. It expects a directory
1079(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001080
1081=item B<extract>
1082
Akrona76d8352016-10-27 16:27:32 +02001083 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1084
1085Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001086
Akron63f20d42017-04-10 23:40:29 +02001087=item B<serial>
1088
1089 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1090
1091Convert archives sequentially. The inputs are not merged but treated
1092as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001094are created based on the archive name. In case the C<--to-tar> flag is given,
1095the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001096
1097
Akron84b53ad2022-01-14 12:39:15 +01001098=item B<slimlog>
1099
1100 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1101
Filters out all uninformative (i.e. successful) messages from logs to simplify
log checks. Expects no further options.
1104
1105
Akron941c1a62016-02-23 17:41:41 +01001106=back
1107
1108
1109=head1 OPTIONS
1110
1111=over 2
1112
Akrona76d8352016-10-27 16:27:32 +02001113=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001114
Akrona76d8352016-10-27 16:27:32 +02001115Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001116
Akron7606afa2016-10-25 16:23:49 +02001117Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001118document, while C<archive> expects a KorAP-XML corpus folder or a zip
1119file to batch process multiple files.
1120C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001121
Akrona76d8352016-10-27 16:27:32 +02001122C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001123that the first archive listed contains all primary data files
1124and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001125
Akron7606afa2016-10-25 16:23:49 +02001126 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001127
Akron821db3d2017-04-06 21:19:31 +02001128Input may also be defined using BSD glob wildcards.
1129
1130 -i 'file/news*.zip'
1131
1132The extended input array will be sorted in length order, so the shortest
1133path needs to contain all primary data files and all meta data files.
1134
Akron0c3e3752016-06-28 15:55:53 +02001135(The directory structure follows the base directory format,
1136that may include a C<.> root folder.
1137In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001138need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001140
Akron7606afa2016-10-25 16:23:49 +02001141To support zip files, a version of C<unzip> needs to be installed that is
1142compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001143
Akron7606afa2016-10-25 16:23:49 +02001144B<The root folder switch using the hash sign is experimental and
1145may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001146
Akronf73ffb62018-06-27 12:13:59 +02001147
Akron63f20d42017-04-10 23:40:29 +02001148=item B<--input-base|-ib> <directory>
1149
1150The base directory for inputs.
1151
1152
Akron941c1a62016-02-23 17:41:41 +01001153=item B<--output|-o> <directory|file>
1154
1155Output folder for archive processing or
1156document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001157writes to C<STDOUT> by default
1158(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001159
1160=item B<--overwrite|-w>
1161
1162Overwrite files that already exist.
1163
Akronf73ffb62018-06-27 12:13:59 +02001164
Akron3741f8b2016-12-21 19:55:21 +01001165=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001166
1167Define the default tokenization by specifying
1168the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001169of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001170This will directly take the file instead of running
1171the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001172
Akron3741f8b2016-12-21 19:55:21 +01001173
1174=item B<--base-sentences|-bs> <foundry>#<layer>
1175
1176Define the layer for base sentences.
1177If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001178Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1179layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001180
1181 Defaults to unset.
1182
1183
1184=item B<--base-paragraphs|-bp> <foundry>#<layer>
1185
1186Define the layer for base paragraphs.
1187If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001188Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001190
1191 Defaults to unset.
1192
1193
Akron41ac10b2017-02-08 22:47:25 +01001194=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1195
1196Define the layer for base pagebreaks.
1197Currently C<DeReKo#Structure> is the only layer supported.
1198
1199 Defaults to unset.
1200
1201
Akron941c1a62016-02-23 17:41:41 +01001202=item B<--skip|-s> <foundry>[#<layer>]
1203
Akronf7ad89e2016-03-16 18:22:47 +01001204Skip specific annotations by specifying the foundry
1205(and optionally the layer with a C<#>-prefix),
1206e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001207Can be set multiple times.
1208
Akronf73ffb62018-06-27 12:13:59 +02001209
Akronc13a1702016-03-15 19:33:14 +01001210=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001211
Akronf7ad89e2016-03-16 18:22:47 +01001212Convert specific annotations by specifying the foundry
1213(and optionally the layer with a C<#>-prefix),
1214e.g. C<Mate> or C<Mate#Morpho>.
1215Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001216
Akronf73ffb62018-06-27 12:13:59 +02001217
Akroned9baf02019-01-22 17:03:25 +01001218=item B<--non-word-tokens|-nwt>
1219
1220Tokenize non-word tokens like word tokens (defined as matching
1221C</[\d\w]/>). Useful to treat punctuations as tokens.
1222
1223 Defaults to unset.
1224
Akronf1849aa2019-12-16 23:35:33 +01001225
1226=item B<--non-verbal-tokens|-nvt>
1227
Tokenize non-verbal tokens that are marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1230
1231 Defaults to unset.
1232
1233
Akron941c1a62016-02-23 17:41:41 +01001234=item B<--jobs|-j>
1235
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001237for archive processing.
Akron11c80302016-03-18 19:44:43 +01001238Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001239
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1242
Akronc11f7982017-02-21 21:20:14 +01001243Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001244times the number of available cores, in case L<Sys::Info>
1245is available.
Akronf7ad89e2016-03-16 18:22:47 +01001246This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001247
Akronf73ffb62018-06-27 12:13:59 +02001248
Akron263274c2019-02-07 09:48:30 +01001249=item B<--koral|-k>
1250
1251Version of the output format. Supported versions are:
1252C<0> for legacy serialization, C<0.03> for serialization
1253with metadata fields as key-values on the root object,
1254C<0.4> for serialization with metadata fields as a list
1255of C<"@type":"koral:field"> objects.
1256
1257Currently defaults to C<0.03>.
1258
1259
Akron9ec88872017-04-12 16:29:06 +02001260=item B<--sequential-extraction|-se>
1261
Flag to indicate that archives are extracted sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1263Some systems may have problems with extracting multiple archives
1264to the same folder at the same time.
1265Can be flagged using C<--no-sequential-extraction> as well.
1266Defaults to C<false>.
1267
Akronf73ffb62018-06-27 12:13:59 +02001268
Akron35db6e32016-03-17 22:42:22 +01001269=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001270
Akron35db6e32016-03-17 22:42:22 +01001271Define the metadata parser to use. Defaults to C<I5>.
1272Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1273This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001274
Akronf73ffb62018-06-27 12:13:59 +02001275
Akron941c1a62016-02-23 17:41:41 +01001276=item B<--gzip|-z>
1277
Akronf7ad89e2016-03-16 18:22:47 +01001278Compress the output.
1279Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001280
Akronf73ffb62018-06-27 12:13:59 +02001281
Akron11c80302016-03-18 19:44:43 +01001282=item B<--cache|-c>
1283
1284File to mmap a cache (using L<Cache::FastMmap>).
1285Defaults to C<korapxml2krill.cache> in the calling directory.
1286
Akronf73ffb62018-06-27 12:13:59 +02001287
Akron11c80302016-03-18 19:44:43 +01001288=item B<--cache-size|-cs>
1289
1290Size of the cache. Defaults to C<50m>.
1291
Akronf73ffb62018-06-27 12:13:59 +02001292
Akron11c80302016-03-18 19:44:43 +01001293=item B<--cache-init|-ci>
1294
1295Initialize cache file.
1296Can be flagged using C<--no-cache-init> as well.
1297Defaults to C<true>.
1298
Akronf73ffb62018-06-27 12:13:59 +02001299
Akron11c80302016-03-18 19:44:43 +01001300=item B<--cache-delete|-cd>
1301
1302Delete cache file after processing.
1303Can be flagged using C<--no-cache-delete> as well.
1304Defaults to C<true>.
1305
Akronf73ffb62018-06-27 12:13:59 +02001306
Akron636aa112017-04-07 18:48:56 +02001307=item B<--config|-cfg>
1308
Configure the parameters of your call in a file
of key-value pairs separated by whitespace:
1311
1312 overwrite 1
1313 token DeReKo#Structure
1314 ...
1315
1316Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001317C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001318C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001319C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001321C<base-sentences>, C<base-paragraphs>,
1322C<base-pagebreaks>,
1323C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001324(semicolon separated), C<anno> (semicolon separated).
1325
Akronf73ffb62018-06-27 12:13:59 +02001326Configuration parameters will always be overwritten by
1327passed parameters.
1328
1329
Akron81500102017-04-07 20:45:44 +02001330=item B<--temporary-extract|-te>
1331
1332Only valid for the C<archive> command.
1333
1334This will first extract all files into a
1335directory and then will archive.
1336If the directory is given as C<:temp:>,
1337a temporary directory is used.
1338This is especially useful to avoid
1339massive unzipping and potential
1340network latency.
Akron636aa112017-04-07 18:48:56 +02001341
Akronf73ffb62018-06-27 12:13:59 +02001342
Akronc93a0802019-07-11 15:48:34 +02001343=item B<--to-tar>
1344
1345Only valid for the C<archive> command.
1346
1347Writes the output into a tar archive.
1348
1349
Akrone10ad322016-02-27 10:54:26 +01001350=item B<--sigle|-sg>
1351
Akron20807582016-10-26 17:11:34 +02001352Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001353Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001354I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001355Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001356In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001357On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001358
Akronf73ffb62018-06-27 12:13:59 +02001359
Akron941c1a62016-02-23 17:41:41 +01001360=item B<--log|-l>
1361
Akronb9c33812020-10-21 16:19:35 +02001362The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001363
Akronf73ffb62018-06-27 12:13:59 +02001364
Akron941c1a62016-02-23 17:41:41 +01001365=item B<--help|-h>
1366
Akron42f48c12020-02-14 13:08:13 +01001367Print help information.
Akron941c1a62016-02-23 17:41:41 +01001368
Akronf73ffb62018-06-27 12:13:59 +02001369
Akron941c1a62016-02-23 17:41:41 +01001370=item B<--version|-v>
1371
1372Print version information.
1373
1374=back
1375
Akronf73ffb62018-06-27 12:13:59 +02001376
Akronc13a1702016-03-15 19:33:14 +01001377=head1 ANNOTATION SUPPORT
1378
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1383
Akron821db3d2017-04-06 21:19:31 +02001384 Base
1385 #Paragraphs
1386 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001387
Akron821db3d2017-04-06 21:19:31 +02001388 Connexor
1389 #Morpho
1390 #Phrase
1391 #Sentences
1392 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001393
Akron821db3d2017-04-06 21:19:31 +02001394 CoreNLP
1395 #Constituency
1396 #Morpho
1397 #NamedEntities
1398 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001399
Akronce125b62017-06-19 11:54:36 +02001400 CMC
1401 #Morpho
1402
Akron821db3d2017-04-06 21:19:31 +02001403 DeReKo
1404 #Structure
Akronc13a1702016-03-15 19:33:14 +01001405
Akron57510c12019-01-04 14:58:53 +01001406 DGD
1407 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001408 #Structure
Akron57510c12019-01-04 14:58:53 +01001409
Akron821db3d2017-04-06 21:19:31 +02001410 DRuKoLa
1411 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001412
Akron821db3d2017-04-06 21:19:31 +02001413 Glemm
1414 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001415
Akronabb36902021-10-11 15:51:06 +02001416 Gingko
1417 #Morpho
1418
Akronea1aed52018-07-19 14:43:34 +02001419 HNC
1420 #Morpho
1421
Akron4c679192018-01-16 17:41:49 +01001422 LWC
1423 #Dependency
1424
Akron821db3d2017-04-06 21:19:31 +02001425 Malt
1426 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001427
Akron821db3d2017-04-06 21:19:31 +02001428 MarMoT
1429 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001430
Akron821db3d2017-04-06 21:19:31 +02001431 Mate
1432 #Dependency
1433 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001434
Akron821db3d2017-04-06 21:19:31 +02001435 MDParser
1436 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001437
Akron821db3d2017-04-06 21:19:31 +02001438 OpenNLP
1439 #Morpho
1440 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001441
Akron07e24772020-04-23 14:00:54 +02001442 RWK
1443 #Morpho
1444 #Structure
1445
Akron821db3d2017-04-06 21:19:31 +02001446 Sgbr
1447 #Lemma
1448 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001449
Akron7d5e6382019-08-08 16:36:27 +02001450 Talismane
1451 #Dependency
1452 #Morpho
1453
Akron821db3d2017-04-06 21:19:31 +02001454 TreeTagger
1455 #Morpho
1456 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001457
Akron821db3d2017-04-06 21:19:31 +02001458 XIP
1459 #Constituency
1460 #Morpho
1461 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001462
Akronc13a1702016-03-15 19:33:14 +01001463
1464More importers are in preparation.
1465New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1466See the built-in annotation importers as examples.
1467
Akronf73ffb62018-06-27 12:13:59 +02001468
Akron41e6c8b2021-10-14 20:22:18 +02001469=head1 METADATA SUPPORT
1470
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1472developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1473
1474=over 2
1475
1476=item I5 - Meta data for all I5 files
1477
1478=item Sgbr - Meta data from the Schreibgebrauch project
1479
1480=item Gingko - Meta data from the Gingko project in addition to I5
1481
1482=back
1483
1484More importers are in preparation.
1485New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1486See the built-in meta data importers as examples.
1487
1488
Akron8f69d632020-01-15 16:58:11 +01001489=head1 About KorAP-XML
1490
1491KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1492data model (Bański et al. 2013), where text data are stored physically
1493separated from their interpretations (i.e. annotations).
1494A text document in KorAP-XML therefore consists of several files
1495containing primary data, metadata and annotations.
1496
1497The structure of a single KorAP-XML document can be as follows:
1498
1499 - data.xml
1500 - header.xml
1501 + base
1502 - tokens.xml
1503 - ...
1504 + struct
1505 - structure.xml
1506 - ...
1507 + corenlp
1508 - morpho.xml
1509 - constituency.xml
1510 - ...
1511 + tree_tagger
1512 - morpho.xml
1513 - ...
1514 - ...
1515
1516The C<data.xml> contains the primary data, the C<header.xml> contains
1517the metadata, and the annotation layers are stored in subfolders
1518like C<base>, C<struct> or C<corenlp>
1519(so-called "foundries"; Bański et al. 2013).
1520
1521Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001522(Lüngen and Sperberg-McQueen 2012). See the documentation in
1523L<KorAP::XML::Meta::I5> for translatable fields.
1524
1525Annotations correspond to a variant of the TEI-P5 feature structures
1526(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001527Annotation feature structures refer to character sequences of the primary text
1528inside the C<text> element of the C<data.xml>.
1529A single annotation containing the lemma of a token can have the following structure:
1530
1531 <span from="0" to="3">
1532 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1533 <f name="lex">
1534 <fs>
1535 <f name="lemma">zum</f>
1536 </fs>
1537 </f>
1538 </fs>
1539 </span>
1540
The C<from> and C<to> attributes refer to the character span
1542in the primary text.
1543Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1544the structure may vary. See L<KorAP::XML::Annotation::*> for various
1545annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001546
1547Multiple KorAP-XML documents are organized on three levels following
1548the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1549corpus E<gt> document E<gt> text. On each level metadata information
1550can be stored, that C<korapxml2krill> will merge to a single metadata
1551object per text. A corpus is therefore structured as follows:
1552
1553 + <corpus>
1554 - header.xml
1555 + <document>
1556 - header.xml
1557 + <text>
1558 - data.xml
1559 - header.xml
1560 - ...
1561 - ...
1562
1563A single text can be identified by the concatenation of
1564the corpus identifier, the document identifier and the text identifier.
1565This identifier is called the text sigle
1566(e.g. a text with the identifier C<18486> in the document C<060> in the
1567corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1568
These corpora are often stored in zip files, which C<korapxml2krill>
can process directly. Corpora may also be split into multiple zip archives
1571(e.g. one zip file per foundry), which is also supported (see C<--input>).
1572
1573Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1574in form of a test suite.
1575The resulting JSON format merges all annotation layers
1576based on a single token stream.
1577
1578=head2 References
1579
1580Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1581KorAP data model: first approximation, December.
1582
1583Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1584"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1585Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1586L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1587
1588Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1589"Robust corpus architecture: a new look at virtual collections and data access",
1590Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1591L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1592
1593Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1594Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1595"Towards an international standard on featurestructure representation",
1596Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1597pp. 373-376.
1598L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1599
1600Harald Lüngen and C. M. Sperberg-McQueen (2012):
1601"A TEI P5 Document Grammar for the IDS Text Model",
1602Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1603L<PDF|https://journals.openedition.org/jtei/pdf/508>
1604
1605TEI Consortium, eds:
1606"Feature Structures",
1607Guidelines for Electronic Text Encoding and Interchange.
1608L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1609
Akron941c1a62016-02-23 17:41:41 +01001610=head1 AVAILABILITY
1611
1612 https://github.com/KorAP/KorAP-XML-Krill
1613
1614
1615=head1 COPYRIGHT AND LICENSE
1616
Akron9a2545e2022-01-16 15:15:50 +01001617Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001618
Akron6882d7d2021-02-08 09:43:57 +01001619Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001620
Akrona76d8352016-10-27 16:27:32 +02001621Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001622
Akron6882d7d2021-02-08 09:43:57 +01001623L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001624Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001625L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001626member of the
Akronf1849aa2019-12-16 23:35:33 +01001627L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001628
1629This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001630L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001631
1632=cut