blob: e909b09f99eb97e5a2d86b4b6792a851004a7680 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akroneb370a02022-02-24 13:33:40 +010019use KorAP::XML::TarBuilder;
Akron93d620e2016-02-05 19:40:05 +010020use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020021use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020022use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010023use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akronabb36902021-10-11 15:51:06 +0200154#
155# 2021/10/11
156# - Introduced support for Gingko
Akron9a2545e2022-01-16 15:15:50 +0100157#
158# 2022/01/17
159# - Support for temporary extraction in config
Akron84b53ad2022-01-14 12:39:15 +0100160# - Introduced support for Gingko
Akrona65cd682022-07-21 15:40:40 +0200161#
162# 2022/07/21
163# - Support for NKJP
Akron941c1a62016-02-23 17:41:41 +0100164# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100165
# Date of the most recent change
our $LAST_CHANGE   = '2022/07/21';

# Directory the script is located in
our $LOCAL         = $FindBin::Bin;

# Default version of the Koral serialization
our $KORAL_VERSION = 0.03;

# One-line banner handed to pod2usage as the usage message
our $VERSION_MSG   =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
172
# Parse command: The first argument is taken as the command name,
# unless it starts with a dash (and therefore looks like an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Keep a copy of the remaining arguments
my @keep_argv = @ARGV;

# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100185
# Parameters passed to pod2usage whenever the command line is invalid:
# Show the usage sections together with the version banner
# and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Parse options from the command line.
# Scalar options are written to %cfg, repeatable options to
# @input, @skip, @sigle and @anno. In case GetOptions reports
# a parsing error (e.g. an unknown option), the usage is shown
# and the script stops instead of running with unintended defaults.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'gzip|z'                   => \($cfg{gzip}),
  'temporary-extract|te=s'   => \($cfg{extract_dir}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \($cfg{cache_file}),
  'config|cfg=s'             => \(my $cfg_file),
  'log|l=s'                  => \($cfg{log}),
  'anno|a=s'                 => \@anno,

  # Deprecated flags are still accepted, but only trigger a warning
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },
  'jobs|j=i'                 => \($cfg{jobs}),
  'koral|k=f'                => \($cfg{koral}),
  'to-tar'                   => \($cfg{to_tar}),
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),

  # Show the full usage
  'help|h'                   => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Show the version banner only
  'version|v'                => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(%ERROR_HASH);
Akron63f20d42017-04-10 23:40:29 +0200244
# Load from configuration and fill non-given data.
# Values passed on the command line always take precedence
# over values from the configuration file.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: The key in %cfg is the option name
  # with all dashes replaced by underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction
                      temporary-extract cache-init
                      koral extract-dir jobs!) {
    (my $underlined = $key) =~ tr/-/_/;
    next if defined $cfg{$underlined} || !defined $config{$key};
    $cfg{$underlined} = $config{$key};
  };

  # List options: Multiple values are separated by semicolons
  foreach my $entry (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $values) = @$entry;

    # Already given on the command line or not configured
    next if scalar(@$values) || !defined $config{$key};

    @$values = split /\s*;\s*/, $config{$key};
  };
}

# A configuration file was requested, but does not exist -
# report this instead of silently falling back to the defaults
elsif ($cfg_file) {
  warn "Configuration file '$cfg_file' does not exist\n";
};
280
# Init variables and set default values
my $output          = $cfg{output};
my $input_base      = $cfg{input_base};
my $gzip            = $cfg{gzip};
my $to_tar          = $cfg{to_tar};

# The command line option is stored as 'extract_dir', while the
# configuration key 'temporary-extract' ends up in 'temporary_extract'
my $extract_dir     = $cfg{extract_dir} // $cfg{temporary_extract};

my $token_base      = $cfg{token} // 'OpenNLP#tokens';

# The command line option is stored as 'cache_file', while the
# configuration key 'cache' ends up in 'cache' - respect both,
# with the command line taking precedence
my $cache_file      = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs            = $cfg{jobs} // 0;
my $cache_delete    = $cfg{cache_delete} // 1;
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;
Akron63f20d42017-04-10 23:40:29 +0200295
# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is not combined with a statement modifier,
# as the behaviour of 'my ... if ...' is undefined in Perl.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension (the layer is undefined,
# in case the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akron3741f8b2016-12-21 19:55:21 +0100301
# Convert sigle to path construct,
# i.e. 'Corpus_Doc.Text' becomes 'Corpus/Doc/Text'
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Lookup table of all lower-cased values to skip
my %skip = map { (lc($_) => 1) } @skip;

# Log to STDERR with the requested log level
Log::Any::Adapter->set(
  'Stderr',
  log_level => uc($cfg{log} // 'ERROR')
);
Akron63f20d42017-04-10 23:40:29 +0200311
# Start log slimming
if ($cmd && $cmd eq 'slimlog') {
  require KorAP::XML::Log::Slim;

  # The log file is the first remaining argument
  my $log_file = shift @ARGV;

  # No log file was passed or the log file does not exist
  unless (defined $log_file && -e $log_file) {
    warn "Log file can't be found";
    exit(1);
  };

  # Run log filter
  my $slimmer = KorAP::XML::Log::Slim->new($log_file);
  $slimmer->slim_to;

  exit;
};
333
334
# With a command given, the output has to be an existing
# directory, unless the to-tar option is set
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000345
# Start serial processing:
# Each input is processed by a separate call of the
# 'archive' command, writing to its own output target
if ($cmd && $cmd eq 'serial') {

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set individually for each call
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag, either followed by
    # the value ('-i value') or including it ('--input=value')
    if (/\A--?(?:input|i|output|o)(=.*)?\z/s) {
      $remove_next = defined $1 ? 0 : 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Number of archive calls that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (as a list, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving and remember failures,
    # but continue with the remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed++;
    };
  };

  # Reflect failed archive calls in the exit code
  exit($failed ? 1 : 0);
};
395
# Define supported (and preinstalled) transformation modules.
# Every entry has the form [Foundry, Layer] with an optional
# third element. The order of the entries is significant
# for everything iterating over @layers.
my @layers = ();

# Base - only in case no other base is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] }
  qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all structures based on DeReKo#Structure
my @dereko_attr = map  { $_->[1] }
                  grep { $_->[0] eq 'dereko#structure' } (
                    [$base_sentences,  'sentences'],
                    [$base_paragraphs, 'paragraphs'],
                    [$base_pagebreaks, 'pagebreaks']
                  );

push @layers, @dereko_attr
  ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
  : ['DeReKo', 'Structure'];

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa, Gingko, Glemm and HNC
push @layers, map { [$_, 'Morpho'] } qw/DRuKoLa Gingko Glemm HNC/;

# LWC and Malt
push @layers, map { [$_, 'Dependency'] } qw/LWC Malt/;

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# NKJP
push @layers, map { ['NKJP', $_] } qw/Morpho NamedEntities/;

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
Akrone1dbc382016-07-08 22:24:52 +0200517
Akron4fa37c32017-01-20 14:43:10 +0100518
# Check filters
my @filtered_anno = $skip{'#all'}

  # Everything is skipped - only use the annotations given explicitly
  ? (map { [ split('#', $_) ] } @anno)

  # Add all annotations that are not skipped,
  # i.e. neither the Foundry nor the Foundry#Layer is in the skip list
  : (grep {
      my ($foundry, $layer) = (lc($_->[0]), lc($_->[1]));
      !($skip{$foundry} || $skip{$foundry . '#' . $layer});
    } @layers);
537
Akrone1dbc382016-07-08 22:24:52 +0200538
# TODO: This should not be initialized for batch
my $cache = do {
  my %cache_param = (
    share_file => $cache_file,
    cache_size => ($cfg{cache_size} // '50m'),
    init_file  => ($cfg{cache_init} // 1)
  );
  Cache::FastMmap->new(%cache_param);
};

# Create batch object
my $batch_file = do {
  my %batch_param = (
    cache             => $cache,
    meta_type         => $cfg{meta},
    overwrite         => $cfg{overwrite},

    # Tokenization basis
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,

    gzip              => $gzip,
    log               => $log,
    koral             => ($cfg{koral} // $KORAL_VERSION),

    # Annotations remaining after filtering
    anno              => \@filtered_anno,

    non_word_tokens   => ($cfg{non_word_tokens}   // 0),
    non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
  );
  KorAP::XML::Batch::File->new(%batch_param);
};
560
Akrone512b7c2020-08-07 16:16:12 +0200561
# Auto adjust jobs: A value of -1 requests
# five jobs per available core
if ($jobs eq '-1') {

  # Sys::Info is optional - load it at runtime and treat every
  # failure (module missing, device not supported) as "unknown"
  my $cores = eval {
    require Sys::Info;
    require Sys::Info::Constants;
    Sys::Info->new->device('CPU')->count;
  };

  # Fall back to a single core
  unless ($cores) {
    $log->warn("Unable to determine number of cores");
    $cores = 1;
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
575
576
# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@new_input, bsd_glob($pattern));
  };

  # None of the patterns matched - stop here, as all
  # following steps rely on at least one existing input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
596
597
# Process a single file (no command was given)
if (!$cmd) {

  # The timers are initialized at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the
  # timers were initialized, then reset the last stop
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Only the first input is respected
  my ($input) = @input;

  # Ensure the input path ends with a slash
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  $cache_delete and unlink($cache_file);

  stop_time;
  exit;
};
629
Nils Diewald59094f22014-11-05 18:20:50 +0000630
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object based on the first input,
  # which has to be an existing file
  my $archive = -f($input[0]) && KorAP::XML::Archive->new($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $annotation (@input[1..$#input]) {
    $archive->attach($annotation);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

# No semicolon here, as the following 'elsif' belongs to this statement
}
675
Akron81500102017-04-07 20:45:44 +0200676
Akron941c1a62016-02-23 17:41:41 +0100677# Process an archive
678elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000679
Akron81500102017-04-07 20:45:44 +0200680 my $archive_output;
681
682 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100683 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200684
685 # Create new archive object
686 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
687
688 # Check zip capabilities
689 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200690 $log->error("Unzip is not installed or incompatible.");
691 exit 1;
Akron81500102017-04-07 20:45:44 +0200692 };
693
694 # Add further annotation archived
695 $archive->attach($_) foreach @input[1..$#input];
696
697 # Create a temporary directory
698 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200699 $extract_dir = tempdir(CLEANUP => 0);
700 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200701 };
702
Akron63f20d42017-04-10 23:40:29 +0200703 # Add some random extra to avoid clashes with multiple archives
704 $extract_dir = catdir($extract_dir, random_string('cccccc'));
705
Akron31a08cb2019-02-20 20:43:26 +0100706 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200707 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200708 @input = ($extract_dir);
709 }
710 else {
711 $log->error('Unable to extract from primary archive ' . $input[0] .
712 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200713 exit 1;
Akron81500102017-04-07 20:45:44 +0200714 };
715 }
716
717 # Can't create archive object
718 else {
719 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200720 exit 1;
Akron81500102017-04-07 20:45:44 +0200721 };
722 };
723
Akron7d4cdd82016-08-17 21:39:45 +0200724 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100725 my $pool = Parallel::ForkManager->new($jobs);
726
Akron7d4cdd82016-08-17 21:39:45 +0200727 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100728 my $iter = 1; # Current text in process
729
Akronda3097e2017-04-23 19:53:57 +0200730 my $tar_archive;
731 my $output_dir = $output;
732 my $tar_fh;
733
734 # Initialize tar archive
735 if ($to_tar) {
Akronda3097e2017-04-23 19:53:57 +0200736
737 # Set output name
738 my $tar_file = $output;
739 unless ($tar_file =~ /\.tar$/) {
740 $tar_file .= '.tar';
741 };
742
743 # Initiate the tar file
744 print "Writing to file $tar_file\n";
745 $tar_fh = IO::File->new($tar_file, 'w');
746 $tar_fh->binmode(1);
747
Akroneb370a02022-02-24 13:33:40 +0100748 # Use tar builder for archiving
749 if (eval("use Archive::Tar::Builder; 1;")) {
750 $tar_archive = Archive::Tar::Builder->new(
751 ignore_errors => 1
752 );
753
754 # Set handle
755 $tar_archive->set_handle($tar_fh);
756 }
757
758 # Fallback solution
759 else {
760 $tar_archive = KorAP::XML::TarBuilder->new(
761 $tar_fh
762 );
763 };
Akronda3097e2017-04-23 19:53:57 +0200764
765 # Output to temporary directory
766 $output_dir = File::Temp->newdir;
767 };
768
Akron941c1a62016-02-23 17:41:41 +0100769 # Report on fork message
770 $pool->run_on_finish (
771 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200772 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100773 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200774
Akron08385f62016-03-22 20:37:04 +0100775 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200776 ($iter++) . "/$count]" .
777 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200778 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200779
780 if (!$code && $to_tar && $data->[2]) {
781 my $filename = $data->[2];
782
783 # Lock filehandle
784 if (flock($tar_fh, LOCK_EX)) {
785
Akron9a062ce2017-07-04 19:12:05 +0200786 my $clean_file = fileparse($filename);
787
Akronda3097e2017-04-23 19:53:57 +0200788 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200789 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200790 unlink $filename;
791
792 # Unlock filehandle
793 flock($tar_fh, LOCK_UN);
794 }
795 else {
796 $log->warn("Unable to add $filename to archive");
797 };
798 };
799
Akron4c0cf312016-10-15 16:42:09 +0200800 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100801 }
802 );
803
804 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200805 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100806 print "Reading data ...\n";
807
Akron7d4cdd82016-08-17 21:39:45 +0200808 # unless (Cache::FastMmap->new(
809 # share_file => $cache_file,
810 # cache_size => $cache_size,
811 # init_file => $cache_init
812 # )) {
813 # print "Unable to intialize cache '$cache_file'\n\n";
814 # exit(1);
815 # };
Akron11c80302016-03-18 19:44:43 +0100816
Akron486f9ab2017-04-22 23:25:19 +0200817
Akron941c1a62016-02-23 17:41:41 +0100818 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100819 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200820 # TODO:
821 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100822 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100823 my @dirs;
824 my $dir;
825
Akron7d4cdd82016-08-17 21:39:45 +0200826 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100827 while (1) {
828 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200829 push @dirs, $dir;
830 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100831 };
832 last unless $it->next;
833 };
834
835 print "Start processing ...\n";
836 $t = Benchmark->new;
837 $count = scalar @dirs;
838
839 DIRECTORY_LOOP:
840 for (my $i = 0; $i < $count; $i++) {
841
Akrone1dbc382016-07-08 22:24:52 +0200842 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200843 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200844 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200845 );
Akron941c1a62016-02-23 17:41:41 +0100846
847 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200848 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200849
Akron13d56622016-10-31 14:54:49 +0100850 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200851 $pool->finish(
852 0,
Akronda3097e2017-04-23 19:53:57 +0200853 [
854 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
855 undef,
856 $filename
857 ]
Akron486f9ab2017-04-22 23:25:19 +0200858 );
Akron3ec48972016-08-17 23:24:52 +0200859 }
860 else {
Akron4c0cf312016-10-15 16:42:09 +0200861 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200862 };
Akron941c1a62016-02-23 17:41:41 +0100863 };
864 }
865
866 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200867 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200868
Akron941c1a62016-02-23 17:41:41 +0100869 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200870 $log->error("Unzip is not installed or incompatible.");
871 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100872 };
873
Akron08385f62016-03-22 20:37:04 +0100874 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200875 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100876
Akron31a08cb2019-02-20 20:43:26 +0100877 # Get sigles to extract
878 my $prefix = set_sigle($archive);
879
Akron941c1a62016-02-23 17:41:41 +0100880 print "Start processing ...\n";
881 $t = Benchmark->new;
882 my @dirs = $archive->list_texts;
883 $count = scalar @dirs;
884
885 ARCHIVE_LOOP:
886 for (my $i = 0; $i < $count; $i++) {
887
888 # Split path information
889 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
890
Akrone1dbc382016-07-08 22:24:52 +0200891 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200892 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200893 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200894 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200895 catfile($corpus, $doc, $text)
896 . '.json' . ($gzip ? '.gz' : '')
897 )
Akrone1dbc382016-07-08 22:24:52 +0200898 );
Akron941c1a62016-02-23 17:41:41 +0100899
900 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200901 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100902
Akron4c0cf312016-10-15 16:42:09 +0200903 # Create temporary file
904 $temp = File::Temp->newdir;
905
Akronbdf434a2016-10-24 17:42:07 +0200906 # TODO: Check if $filename exist at the beginning,
907 # because extraction can be horrible slow!
908
Akron941c1a62016-02-23 17:41:41 +0100909 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100910 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100911
Akron7d4cdd82016-08-17 21:39:45 +0200912 # Create corpus directory
913 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100914
Akron7d4cdd82016-08-17 21:39:45 +0200915 # Temporary directory
916 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100917
Akron7d4cdd82016-08-17 21:39:45 +0200918 # Write file
Akron13d56622016-10-31 14:54:49 +0100919 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200920
Akron4c0cf312016-10-15 16:42:09 +0200921 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100922 $pool->finish(
923 0,
Akronda3097e2017-04-23 19:53:57 +0200924 [
925 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
926 $temp,
927 $filename
928 ]
Akron13d56622016-10-31 14:54:49 +0100929 );
Akron7d4cdd82016-08-17 21:39:45 +0200930 }
931 else {
Akron4c0cf312016-10-15 16:42:09 +0200932 # Delete temporary file
933 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200934 };
Akron941c1a62016-02-23 17:41:41 +0100935 }
Akron7d4cdd82016-08-17 21:39:45 +0200936
937 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100938 else {
Akron4c0cf312016-10-15 16:42:09 +0200939 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100940 };
941 };
942 }
943
944 else {
945 print "Input is neither a directory nor an archive.\n\n";
946 };
947
948 $pool->wait_all_children;
949
Akron11c80302016-03-18 19:44:43 +0100950 # Delete cache file
951 unlink($cache_file) if $cache_delete;
952
Akronda3097e2017-04-23 19:53:57 +0200953 # Close tar filehandle
954 if ($to_tar && $tar_fh) {
955 $tar_archive->finish;
956 $tar_fh->close;
957 print "Wrote to tar archive.\n";
958 };
959
Akron63f20d42017-04-10 23:40:29 +0200960 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100961 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200962};
Akron941c1a62016-02-23 17:41:41 +0100963
Nils Diewald2db9ad02013-10-29 19:26:43 +0000964
# For an archive, this will create the list
# of all sigles to process.
#
# Arguments:
#   $archive - archive object; the methods list_texts, split_path,
#              check_prefix and extract_sigle are called on it.
#
# Side effects on file-level variables:
#   - If @sigle is empty, it is filled with one "corpus/doc/text"
#     entry per text listed by the archive.
#   - If @sigle is preset, extract_sigle is called immediately
#     (with $output) for every document sigle ("corpus/doc"), and
#     @sigle is reduced to the remaining text sigles.
#
# Returns the prefix value:
#   - without preset sigles, the first element returned by split_path
#     for the last listed text (or 1 if no text is listed),
#   - with preset sigles, the result of check_prefix.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {

    # A named lexical loop variable is used instead of the global $_,
    # so the method calls below can't modify the current element
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # The prefix is checked only once, when the first sigle is handled
  my $prefix_checked = 0;
  my $check_prefix = sub {
    return if $prefix_checked;
    $prefix_checked = 1;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    return;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  foreach my $current (@sigle) {

    # Sigle is a text sigle - keep it for later processing
    unless ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @text_sigle, $current;
      $check_prefix->();
      next;
    };

    # Sigle is a doc sigle - extract the whole document right away
    print "$current ...";
    $check_prefix->();
    print "\n";

    print '... ' . (
      $archive->extract_sigle([$current], $output, $sequential_extraction ? 1 : $jobs)
        ? '' : 'not '
    );
    print "extracted.\n";
  };

  # Only text sigles are left to be processed
  @sigle = @text_sigle;

  return $prefix;
};
1038
1039
# After all processing is done, remove the extraction directory
# (if one is set) and log how many objects were removed
if ($extract_dir) {
  my %remove_opts = (safe => 1);
  my $objects = remove_tree($extract_dir, \%remove_opts);
  $log->info("Removed directory $extract_dir with $objects objects");
};


# Finish the console output with an empty line
print "\n";
1048
Nils Diewald2db9ad02013-10-29 19:26:43 +00001049__END__
Akron941c1a62016-02-23 17:41:41 +01001050
1051=pod
1052
1053=encoding utf8
1054
1055=head1 NAME
1056
Akron42f48c12020-02-14 13:08:13 +01001057korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001058
1059
1060=head1 SYNOPSIS
1061
Akrona76d8352016-10-27 16:27:32 +02001062 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001063
Akron2fd402b2016-10-27 21:26:48 +02001064
Akron941c1a62016-02-23 17:41:41 +01001065=head1 DESCRIPTION
1066
1067L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1068compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001069The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001070
1071
1072=head1 INSTALLATION
1073
1074The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1075
Akronaf386982016-10-12 00:33:25 +02001076 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001077
Akronc13a1702016-03-15 19:33:14 +01001078In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001079be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001080Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akroneb370a02022-02-24 13:33:40 +01001081Optionally installing L<Archive::Tar::Builder> speeds up archive building.
1082Optional support for L<Sys::Info> to calculate available cores is available.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001084
1085=head1 ARGUMENTS
1086
Akrona76d8352016-10-27 16:27:32 +02001087 $ korapxml2krill -z --input <directory> --output <filename>
1088
1089Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001090It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001091
Akron941c1a62016-02-23 17:41:41 +01001092=over 2
1093
1094=item B<archive>
1095
Akron081639e2017-04-21 19:01:39 +02001096 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001097
Akron2fd402b2016-10-27 21:26:48 +02001098Converts an archive of KorAP-XML documents. It expects a directory
1099(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001100
1101=item B<extract>
1102
Akrona76d8352016-10-27 16:27:32 +02001103 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1104
1105Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001106
Akron63f20d42017-04-10 23:40:29 +02001107=item B<serial>
1108
1109 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1110
1111Convert archives sequentially. The inputs are not merged but treated
1112as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001114are created based on the archive name. In case the C<--to-tar> flag is given,
1115the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001116
1117
Akron84b53ad2022-01-14 12:39:15 +01001118=item B<slimlog>
1119
1120 $ korapxml2krill slimlog <logfile> > <logfile-slim>
1121
Filters out all useless aka successful information from logs, to simplify
1123log checks. Expects no further options.
1124
1125
Akron941c1a62016-02-23 17:41:41 +01001126=back
1127
1128
1129=head1 OPTIONS
1130
1131=over 2
1132
Akrona76d8352016-10-27 16:27:32 +02001133=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001134
Akrona76d8352016-10-27 16:27:32 +02001135Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001136
Akron7606afa2016-10-25 16:23:49 +02001137Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001138document, while C<archive> expects a KorAP-XML corpus folder or a zip
1139file to batch process multiple files.
1140C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001141
Akrona76d8352016-10-27 16:27:32 +02001142C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001143that the first archive listed contains all primary data files
1144and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001145
Akron7606afa2016-10-25 16:23:49 +02001146 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001147
Akron821db3d2017-04-06 21:19:31 +02001148Input may also be defined using BSD glob wildcards.
1149
1150 -i 'file/news*.zip'
1151
1152The extended input array will be sorted in length order, so the shortest
1153path needs to contain all primary data files and all meta data files.
1154
Akron0c3e3752016-06-28 15:55:53 +02001155(The directory structure follows the base directory format,
1156that may include a C<.> root folder.
1157In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001158need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001160
Akron7606afa2016-10-25 16:23:49 +02001161To support zip files, a version of C<unzip> needs to be installed that is
1162compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001163
Akron7606afa2016-10-25 16:23:49 +02001164B<The root folder switch using the hash sign is experimental and
1165may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001166
Akronf73ffb62018-06-27 12:13:59 +02001167
Akron63f20d42017-04-10 23:40:29 +02001168=item B<--input-base|-ib> <directory>
1169
1170The base directory for inputs.
1171
1172
Akron941c1a62016-02-23 17:41:41 +01001173=item B<--output|-o> <directory|file>
1174
1175Output folder for archive processing or
1176document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001177writes to C<STDOUT> by default
1178(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001179
1180=item B<--overwrite|-w>
1181
1182Overwrite files that already exist.
1183
Akronf73ffb62018-06-27 12:13:59 +02001184
Akron3741f8b2016-12-21 19:55:21 +01001185=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001186
1187Define the default tokenization by specifying
1188the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001189of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001190This will directly take the file instead of running
1191the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001192
Akron3741f8b2016-12-21 19:55:21 +01001193
1194=item B<--base-sentences|-bs> <foundry>#<layer>
1195
1196Define the layer for base sentences.
1197If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001198Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1199layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001200
1201 Defaults to unset.
1202
1203
1204=item B<--base-paragraphs|-bp> <foundry>#<layer>
1205
1206Define the layer for base paragraphs.
1207If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001208Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001210
1211 Defaults to unset.
1212
1213
Akron41ac10b2017-02-08 22:47:25 +01001214=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1215
1216Define the layer for base pagebreaks.
1217Currently C<DeReKo#Structure> is the only layer supported.
1218
1219 Defaults to unset.
1220
1221
Akron941c1a62016-02-23 17:41:41 +01001222=item B<--skip|-s> <foundry>[#<layer>]
1223
Akronf7ad89e2016-03-16 18:22:47 +01001224Skip specific annotations by specifying the foundry
1225(and optionally the layer with a C<#>-prefix),
1226e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001227Can be set multiple times.
1228
Akronf73ffb62018-06-27 12:13:59 +02001229
Akronc13a1702016-03-15 19:33:14 +01001230=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001231
Akronf7ad89e2016-03-16 18:22:47 +01001232Convert specific annotations by specifying the foundry
1233(and optionally the layer with a C<#>-prefix),
1234e.g. C<Mate> or C<Mate#Morpho>.
1235Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001236
Akronf73ffb62018-06-27 12:13:59 +02001237
Akroned9baf02019-01-22 17:03:25 +01001238=item B<--non-word-tokens|-nwt>
1239
1240Tokenize non-word tokens like word tokens (defined as matching
1241C</[\d\w]/>). Useful to treat punctuations as tokens.
1242
1243 Defaults to unset.
1244
Akronf1849aa2019-12-16 23:35:33 +01001245
1246=item B<--non-verbal-tokens|-nvt>
1247
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1250
1251 Defaults to unset.
1252
1253
Akron941c1a62016-02-23 17:41:41 +01001254=item B<--jobs|-j>
1255
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001257for archive processing.
Akron11c80302016-03-18 19:44:43 +01001258Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001259
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1262
Akronc11f7982017-02-21 21:20:14 +01001263Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001264times the number of available cores, in case L<Sys::Info>
1265is available.
Akronf7ad89e2016-03-16 18:22:47 +01001266This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001267
Akronf73ffb62018-06-27 12:13:59 +02001268
Akron263274c2019-02-07 09:48:30 +01001269=item B<--koral|-k>
1270
1271Version of the output format. Supported versions are:
1272C<0> for legacy serialization, C<0.03> for serialization
1273with metadata fields as key-values on the root object,
1274C<0.4> for serialization with metadata fields as a list
1275of C<"@type":"koral:field"> objects.
1276
1277Currently defaults to C<0.03>.
1278
1279
Akron9ec88872017-04-12 16:29:06 +02001280=item B<--sequential-extraction|-se>
1281
Flag to indicate that extraction runs sequentially, i.e. that
the C<jobs> value does not apply to extraction.
1283Some systems may have problems with extracting multiple archives
1284to the same folder at the same time.
1285Can be flagged using C<--no-sequential-extraction> as well.
1286Defaults to C<false>.
1287
Akronf73ffb62018-06-27 12:13:59 +02001288
Akron35db6e32016-03-17 22:42:22 +01001289=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001290
Akron35db6e32016-03-17 22:42:22 +01001291Define the metadata parser to use. Defaults to C<I5>.
1292Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1293This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001294
Akronf73ffb62018-06-27 12:13:59 +02001295
Akron941c1a62016-02-23 17:41:41 +01001296=item B<--gzip|-z>
1297
Akronf7ad89e2016-03-16 18:22:47 +01001298Compress the output.
1299Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001300
Akronf73ffb62018-06-27 12:13:59 +02001301
Akron11c80302016-03-18 19:44:43 +01001302=item B<--cache|-c>
1303
1304File to mmap a cache (using L<Cache::FastMmap>).
1305Defaults to C<korapxml2krill.cache> in the calling directory.
1306
Akronf73ffb62018-06-27 12:13:59 +02001307
Akron11c80302016-03-18 19:44:43 +01001308=item B<--cache-size|-cs>
1309
1310Size of the cache. Defaults to C<50m>.
1311
Akronf73ffb62018-06-27 12:13:59 +02001312
Akron11c80302016-03-18 19:44:43 +01001313=item B<--cache-init|-ci>
1314
1315Initialize cache file.
1316Can be flagged using C<--no-cache-init> as well.
1317Defaults to C<true>.
1318
Akronf73ffb62018-06-27 12:13:59 +02001319
Akron11c80302016-03-18 19:44:43 +01001320=item B<--cache-delete|-cd>
1321
1322Delete cache file after processing.
1323Can be flagged using C<--no-cache-delete> as well.
1324Defaults to C<true>.
1325
Akronf73ffb62018-06-27 12:13:59 +02001326
Akron636aa112017-04-07 18:48:56 +02001327=item B<--config|-cfg>
1328
1329Configure the parameters of your call in a file
1330of key-value pairs with whitespace separator
1331
1332 overwrite 1
1333 token DeReKo#Structure
1334 ...
1335
1336Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001337C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001338C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001339C<output>, C<koral>,
Akroneb370a02022-02-24 13:33:40 +01001340C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001341C<base-sentences>, C<base-paragraphs>,
1342C<base-pagebreaks>,
1343C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001344(semicolon separated), C<anno> (semicolon separated).
1345
Akronf73ffb62018-06-27 12:13:59 +02001346Configuration parameters will always be overwritten by
1347passed parameters.
1348
1349
Akron81500102017-04-07 20:45:44 +02001350=item B<--temporary-extract|-te>
1351
1352Only valid for the C<archive> command.
1353
1354This will first extract all files into a
1355directory and then will archive.
1356If the directory is given as C<:temp:>,
1357a temporary directory is used.
1358This is especially useful to avoid
1359massive unzipping and potential
1360network latency.
Akron636aa112017-04-07 18:48:56 +02001361
Akronf73ffb62018-06-27 12:13:59 +02001362
Akronc93a0802019-07-11 15:48:34 +02001363=item B<--to-tar>
1364
1365Only valid for the C<archive> command.
1366
1367Writes the output into a tar archive.
1368
1369
Akrone10ad322016-02-27 10:54:26 +01001370=item B<--sigle|-sg>
1371
Akron20807582016-10-26 17:11:34 +02001372Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001373Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001374I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001375Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001376In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001377On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001378
Akronf73ffb62018-06-27 12:13:59 +02001379
Akron941c1a62016-02-23 17:41:41 +01001380=item B<--log|-l>
1381
Akronb9c33812020-10-21 16:19:35 +02001382The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001383
Akronf73ffb62018-06-27 12:13:59 +02001384
Akron941c1a62016-02-23 17:41:41 +01001385=item B<--help|-h>
1386
Akron42f48c12020-02-14 13:08:13 +01001387Print help information.
Akron941c1a62016-02-23 17:41:41 +01001388
Akronf73ffb62018-06-27 12:13:59 +02001389
Akron941c1a62016-02-23 17:41:41 +01001390=item B<--version|-v>
1391
1392Print version information.
1393
1394=back
1395
Akronf73ffb62018-06-27 12:13:59 +02001396
Akronc13a1702016-03-15 19:33:14 +01001397=head1 ANNOTATION SUPPORT
1398
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1400developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
1402L<Krill|https://github.com/KorAP/Krill>.
1403
Akron821db3d2017-04-06 21:19:31 +02001404 Base
1405 #Paragraphs
1406 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001407
Akron821db3d2017-04-06 21:19:31 +02001408 Connexor
1409 #Morpho
1410 #Phrase
1411 #Sentences
1412 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001413
Akron821db3d2017-04-06 21:19:31 +02001414 CoreNLP
1415 #Constituency
1416 #Morpho
1417 #NamedEntities
1418 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001419
Akronce125b62017-06-19 11:54:36 +02001420 CMC
1421 #Morpho
1422
Akron821db3d2017-04-06 21:19:31 +02001423 DeReKo
1424 #Structure
Akronc13a1702016-03-15 19:33:14 +01001425
Akron57510c12019-01-04 14:58:53 +01001426 DGD
1427 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001428 #Structure
Akron57510c12019-01-04 14:58:53 +01001429
Akron821db3d2017-04-06 21:19:31 +02001430 DRuKoLa
1431 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001432
Akron821db3d2017-04-06 21:19:31 +02001433 Glemm
1434 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001435
Akronabb36902021-10-11 15:51:06 +02001436 Gingko
1437 #Morpho
1438
Akronea1aed52018-07-19 14:43:34 +02001439 HNC
1440 #Morpho
1441
Akron4c679192018-01-16 17:41:49 +01001442 LWC
1443 #Dependency
1444
Akron821db3d2017-04-06 21:19:31 +02001445 Malt
1446 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001447
Akron821db3d2017-04-06 21:19:31 +02001448 MarMoT
1449 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001450
Akron821db3d2017-04-06 21:19:31 +02001451 Mate
1452 #Dependency
1453 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001454
Akron821db3d2017-04-06 21:19:31 +02001455 MDParser
1456 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001457
Akrone85a7762022-07-22 08:05:03 +02001458 NKJP
1459 #Morpho
1460 #NamedEntities
1461
Akron821db3d2017-04-06 21:19:31 +02001462 OpenNLP
1463 #Morpho
1464 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001465
Akron07e24772020-04-23 14:00:54 +02001466 RWK
1467 #Morpho
1468 #Structure
1469
Akron821db3d2017-04-06 21:19:31 +02001470 Sgbr
1471 #Lemma
1472 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001473
Akron7d5e6382019-08-08 16:36:27 +02001474 Talismane
1475 #Dependency
1476 #Morpho
1477
Akron821db3d2017-04-06 21:19:31 +02001478 TreeTagger
1479 #Morpho
1480 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001481
Akron821db3d2017-04-06 21:19:31 +02001482 XIP
1483 #Constituency
1484 #Morpho
1485 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001486
Akronc13a1702016-03-15 19:33:14 +01001487
1488More importers are in preparation.
1489New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1490See the built-in annotation importers as examples.
1491
Akronf73ffb62018-06-27 12:13:59 +02001492
Akron41e6c8b2021-10-14 20:22:18 +02001493=head1 METADATA SUPPORT
1494
L<KorAP::XML::Krill> has built-in importers for some meta data variants
1496developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1497
1498=over 2
1499
1500=item I5 - Meta data for all I5 files
1501
1502=item Sgbr - Meta data from the Schreibgebrauch project
1503
1504=item Gingko - Meta data from the Gingko project in addition to I5
1505
1506=back
1507
1508More importers are in preparation.
1509New meta data importers can be defined in the C<KorAP::XML::Meta> namespace.
1510See the built-in meta data importers as examples.
1511
1512
Akron8f69d632020-01-15 16:58:11 +01001513=head1 About KorAP-XML
1514
1515KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1516data model (Bański et al. 2013), where text data are stored physically
1517separated from their interpretations (i.e. annotations).
1518A text document in KorAP-XML therefore consists of several files
1519containing primary data, metadata and annotations.
1520
1521The structure of a single KorAP-XML document can be as follows:
1522
1523 - data.xml
1524 - header.xml
1525 + base
1526 - tokens.xml
1527 - ...
1528 + struct
1529 - structure.xml
1530 - ...
1531 + corenlp
1532 - morpho.xml
1533 - constituency.xml
1534 - ...
1535 + tree_tagger
1536 - morpho.xml
1537 - ...
1538 - ...
1539
1540The C<data.xml> contains the primary data, the C<header.xml> contains
1541the metadata, and the annotation layers are stored in subfolders
1542like C<base>, C<struct> or C<corenlp>
1543(so-called "foundries"; Bański et al. 2013).
1544
1545Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001546(Lüngen and Sperberg-McQueen 2012). See the documentation in
1547L<KorAP::XML::Meta::I5> for translatable fields.
1548
1549Annotations correspond to a variant of the TEI-P5 feature structures
1550(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001551Annotation feature structures refer to character sequences of the primary text
1552inside the C<text> element of the C<data.xml>.
1553A single annotation containing the lemma of a token can have the following structure:
1554
1555 <span from="0" to="3">
1556 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1557 <f name="lex">
1558 <fs>
1559 <f name="lemma">zum</f>
1560 </fs>
1561 </f>
1562 </fs>
1563 </span>
1564
The C<from> and C<to> attributes refer to the character span
1566in the primary text.
1567Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1568the structure may vary. See L<KorAP::XML::Annotation::*> for various
1569annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001570
1571Multiple KorAP-XML documents are organized on three levels following
1572the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1573corpus E<gt> document E<gt> text. On each level metadata information
1574can be stored, that C<korapxml2krill> will merge to a single metadata
1575object per text. A corpus is therefore structured as follows:
1576
1577 + <corpus>
1578 - header.xml
1579 + <document>
1580 - header.xml
1581 + <text>
1582 - data.xml
1583 - header.xml
1584 - ...
1585 - ...
1586
1587A single text can be identified by the concatenation of
1588the corpus identifier, the document identifier and the text identifier.
1589This identifier is called the text sigle
1590(e.g. a text with the identifier C<18486> in the document C<060> in the
1591corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1592
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split in multiple zip archives
1595(e.g. one zip file per foundry), which is also supported (see C<--input>).
1596
1597Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1598in form of a test suite.
1599The resulting JSON format merges all annotation layers
1600based on a single token stream.
1601
1602=head2 References
1603
1604Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1605KorAP data model: first approximation, December.
1606
1607Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1608"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1609Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1610L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1611
1612Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1613"Robust corpus architecture: a new look at virtual collections and data access",
1614Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1615L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1616
1617Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1618Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1619"Towards an international standard on featurestructure representation",
1620Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1621pp. 373-376.
1622L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1623
1624Harald Lüngen and C. M. Sperberg-McQueen (2012):
1625"A TEI P5 Document Grammar for the IDS Text Model",
1626Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1627L<PDF|https://journals.openedition.org/jtei/pdf/508>
1628
1629TEI Consortium, eds:
1630"Feature Structures",
1631Guidelines for Electronic Text Encoding and Interchange.
1632L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1633
Akron941c1a62016-02-23 17:41:41 +01001634=head1 AVAILABILITY
1635
1636 https://github.com/KorAP/KorAP-XML-Krill
1637
1638
1639=head1 COPYRIGHT AND LICENSE
1640
Akron9a2545e2022-01-16 15:15:50 +01001641Copyright (C) 2015-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001642
Akron6882d7d2021-02-08 09:43:57 +01001643Author: L<Nils Diewald|https://www.nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001644
Akrona76d8352016-10-27 16:27:32 +02001645Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001646
Akron6882d7d2021-02-08 09:43:57 +01001647L<KorAP::XML::Krill> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Akron941c1a62016-02-23 17:41:41 +01001648Corpus Analysis Platform at the
Akron6882d7d2021-02-08 09:43:57 +01001649L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001650member of the
Akronf1849aa2019-12-16 23:35:33 +01001651L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001652
1653This program is free software published under the
Akron6882d7d2021-02-08 09:43:57 +01001654L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron941c1a62016-02-23 17:41:41 +01001655
1656=cut