blob: 7d246eca5c0681b1e2e4f616f5490537ca5e99b4 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020015use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
# Release information shown by --help and --version
our $LAST_CHANGE   = '2020/04/23';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Prototypes
sub get_file_name($$);

# Parse command:
# A leading argument that does not start with a dash is
# taken as the sub command and removed from the argument list.
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /\A-/;

# Remember the remaining arguments (without the command),
# before GetOptions consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100174
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Scalar options - these stay undefined unless passed on the
# command line, so later stages can tell "not set" from "set to false"
my (
  $input_base,
  $output,
  $overwrite,
  $meta,
  $token_base,
  $base_sentences,
  $base_paragraphs,
  $base_pagebreaks,
  $gzip,
  $extract_dir,
  $cache_file,
  $cfg_file,
  $log_level,
  $primary,
  $pretty,
  $jobs,
  $koral,
  $to_tar,
  $non_word_tokens,
  $non_verbal_tokens,
  $sequential_extraction,
  $cache_size,
  $cache_delete,
  $cache_init
);

# Arguments shared by the help and the version message
my @usage_base = (
  -msg    => $VERSION_MSG,
  -output => '-'
);

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'non-verbal-tokens|nvt'    => \$non_verbal_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Print the selected POD sections
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      @usage_base
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      @usage_base
    )
  }
);
224
Akron63f20d42017-04-10 23:40:29 +0200225
Akron636aa112017-04-07 18:48:56 +0200226# Load from configuration
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: configuration key and the variable it fills.
  # Values given on the command line always take precedence.
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'non-verbal-tokens'     => \$non_verbal_tokens ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # Already set on the command line
    next if defined $$target;

    # Not part of the configuration
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # List options: the configuration holds a semicolon separated
  # string, that is only used if the option was not passed at all.
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
352
Akron63f20d42017-04-10 23:40:29 +0200353
# Set defaults for all options that were set neither on the
# command line nor in the configuration file
foreach my $default (
  [ \$token_base            => 'OpenNLP#tokens' ],
  [ \$cache_file            => 'korapxml2krill.cache' ],
  [ \$cache_size            => '50m' ],
  [ \$jobs                  => 0 ],
  [ \$koral                 => $KORAL_VERSION ],
  [ \$cache_delete          => 1 ],
  [ \$cache_init            => 1 ],
  [ \$sequential_extraction => 0 ],
  [ \$log_level             => 'ERROR' ],
  [ \$base_sentences        => '' ],
  [ \$base_paragraphs       => '' ],
  [ \$base_pagebreaks       => '' ],
  [ \$non_word_tokens       => 0 ],
  [ \$non_verbal_tokens     => 0 ]
) {
  my ($option, $value) = @$default;
  $$option //= $value;
};

# The base annotations are compared in lower case
$_ = lc($_) foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100373
Akron63f20d42017-04-10 23:40:29 +0200374
375# Initialize log4perl object
# Log to STDERR using the requested log level
my %log_config = (
  'log4perl.rootLogger'       => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'  => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');


if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};


# Arguments for pod2usage on invalid invocations:
# print the usage sections and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 requests five jobs per detected CPU
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
409
Akron821db3d2017-04-06 21:19:31 +0200410
Akron63f20d42017-04-10 23:40:29 +0200411# Start serial processing
# Serial processing:
# Every input (archive or glob pattern) is processed by a separate
# "archive" run of this script, one after the other. Each run writes
# to its own target below the output directory, named after the input.
# The exit status is 1 if any of the runs failed, otherwise 0.
if ($cmd && $cmd eq 'serial') {

  # Output is required - without it the per-input targets
  # would be created relative to the file system root
  pod2usage(%ERROR_HASH) unless $output;

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments passed
  # to the child processes, as they are set per run
  my $remove_next = 0;
  my @child_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Input or output flag with the value attached (e.g. --input=x)
    elsif ($arg =~ /\A(?:-i|--input|-o|--output)=/) {
      next;
    }

    # Pass parameter
    else {
      push @child_argv, $arg;
    };
  };
  @keep_argv = @child_argv;

  # Number of inputs that could not be processed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving and remember failures,
    # but continue with the remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
465
# Lookup for foundries and layers to skip (case insensitive)
my %skip = map { ( lc($_) => 1 ) } @skip;

# All supported annotations as [Foundry, Layer] or
# [Foundry, Layer, Parameter] - the order is significant
my @layers;

# The base layers are only used, if no alternative
# base annotation was requested
push @layers, ['Base', 'Sentences']  if !$base_sentences;
push @layers, ['Base', 'Paragraphs'] if !$base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all base annotations taken from the structure layer
my @dereko_attr = ();
foreach my $base (
  [ 'sentences'  => $base_sentences ],
  [ 'paragraphs' => $base_paragraphs ],
  [ 'pagebreaks' => $base_pagebreaks ]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
566
Akron4fa37c32017-01-20 14:43:10 +0100567
Akrone1dbc382016-07-08 22:24:52 +0200568# Check filters
my @filtered_anno;

# Everything is skipped by default -
# only the explicitly requested annotations are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my ($foundry, $layer) = (lc($_->[0]), lc($_->[1]));

    # Keep unless the Foundry or the Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
586
587# Get tokenization basis
# Get tokenization basis in the form Foundry#Layer.
# The declaration is kept separate from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (there is no layer if the base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200592
593# TODO: This should not be initialized for batch
# Shared cache (see --cache, --cache-size and --cache-init)
my $cache = do {
  my %cache_param = (
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file  => $cache_init
  );
  Cache::FastMmap->new(%cache_param);
};

# Create batch object, that converts single texts based on
# the chosen tokenization and the filtered annotations
my $batch_file = do {
  my @batch_param = (
    cache             => $cache,
    meta_type         => $meta,
    overwrite         => $overwrite,
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    gzip              => $gzip,
    log               => $log,
    koral             => $koral,
    primary           => $primary,
    pretty            => $pretty,
    anno              => \@filtered_anno,
    non_word_tokens   => $non_word_tokens,
    non_verbal_tokens => $non_verbal_tokens
  );
  KorAP::XML::Batch::File->new(@batch_param);
};
616
# Convert sigle to path construct
# (the corpus, document and text parts become path segments)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# With a command given, a defined output has to be an existing
# directory, unless the result is written to a tar file
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
626
Akron63f20d42017-04-10 23:40:29 +0200627
628# Glob and prefix files
# Glob and prefix files:
# Every input is optionally prefixed with the input base and
# expanded as a wildcard pattern. The matches replace @input.
if (@input) {

  # Prefix with input root
  my @patterns = map {
    $input_base ? catfile($input_base, $_) : $_
  } @input;

  # Expand all patterns
  my @new_input = map { bsd_glob($_) } @patterns;

  # A wildcard pattern without any match expands to nothing -
  # stop here instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @patterns));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
647
648
Akron941c1a62016-02-23 17:41:41 +0100649# Process a single file
if (!$cmd) {
  my $input = $input[0];

  # Take the start time as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_last_stop . ' (overall: ' . $overall . ')'
    );
    $main::LAST_STOP = $now;
  };

  # The document path needs a trailing slash
  $input =~ s{([^/])$}{$1/};

  # Create, parse and process the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
680
Nils Diewald59094f22014-11-05 18:20:50 +0000681
Akrone10ad322016-02-27 10:54:26 +0100682# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  if (!$output) {
    pod2usage(%ERROR_HASH);
  };

  # Create new archive object, in case the primary input is a file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  if (!$archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $annotation_archive (@input[1..$#input]) {
    $archive->attach($annotation_archive);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
726
Akron81500102017-04-07 20:45:44 +0200727
Akron941c1a62016-02-23 17:41:41 +0100728# Process an archive
729elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000730
Akron81500102017-04-07 20:45:44 +0200731 my $archive_output;
732
733 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100734 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200735
736 # Create new archive object
737 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
738
739 # Check zip capabilities
740 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200741 $log->error("Unzip is not installed or incompatible.");
742 exit 1;
Akron81500102017-04-07 20:45:44 +0200743 };
744
745 # Add further annotation archived
746 $archive->attach($_) foreach @input[1..$#input];
747
748 # Create a temporary directory
749 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200750 $extract_dir = tempdir(CLEANUP => 0);
751 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200752 };
753
Akron63f20d42017-04-10 23:40:29 +0200754 # Add some random extra to avoid clashes with multiple archives
755 $extract_dir = catdir($extract_dir, random_string('cccccc'));
756
Akron31a08cb2019-02-20 20:43:26 +0100757 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200758 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200759 @input = ($extract_dir);
760 }
761 else {
762 $log->error('Unable to extract from primary archive ' . $input[0] .
763 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200764 exit 1;
Akron81500102017-04-07 20:45:44 +0200765 };
766 }
767
768 # Can't create archive object
769 else {
770 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200771 exit 1;
Akron81500102017-04-07 20:45:44 +0200772 };
773 };
774
Akron7d4cdd82016-08-17 21:39:45 +0200775 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100776 my $pool = Parallel::ForkManager->new($jobs);
777
Akron7d4cdd82016-08-17 21:39:45 +0200778 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100779 my $iter = 1; # Current text in process
780
Akronda3097e2017-04-23 19:53:57 +0200781 my $tar_archive;
782 my $output_dir = $output;
783 my $tar_fh;
784
785 # Initialize tar archive
786 if ($to_tar) {
787 $tar_archive = Archive::Tar::Builder->new(
788 ignore_errors => 1
789 );
790
791 # Set output name
792 my $tar_file = $output;
793 unless ($tar_file =~ /\.tar$/) {
794 $tar_file .= '.tar';
795 };
796
797 # Initiate the tar file
798 print "Writing to file $tar_file\n";
799 $tar_fh = IO::File->new($tar_file, 'w');
800 $tar_fh->binmode(1);
801
802 # Set handle
803 $tar_archive->set_handle($tar_fh);
804
805 # Output to temporary directory
806 $output_dir = File::Temp->newdir;
807 };
808
Akron941c1a62016-02-23 17:41:41 +0100809 # Report on fork message
810 $pool->run_on_finish (
811 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200812 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100813 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200814
Akron08385f62016-03-22 20:37:04 +0100815 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200816 ($iter++) . "/$count]" .
817 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200818 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200819
820 if (!$code && $to_tar && $data->[2]) {
821 my $filename = $data->[2];
822
823 # Lock filehandle
824 if (flock($tar_fh, LOCK_EX)) {
825
Akron9a062ce2017-07-04 19:12:05 +0200826 my $clean_file = fileparse($filename);
827
Akronda3097e2017-04-23 19:53:57 +0200828 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200829 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200830 unlink $filename;
831
832 # Unlock filehandle
833 flock($tar_fh, LOCK_UN);
834 }
835 else {
836 $log->warn("Unable to add $filename to archive");
837 };
838 };
839
Akron4c0cf312016-10-15 16:42:09 +0200840 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100841 }
842 );
843
844 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200845 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100846 print "Reading data ...\n";
847
Akron7d4cdd82016-08-17 21:39:45 +0200848 # unless (Cache::FastMmap->new(
849 # share_file => $cache_file,
850 # cache_size => $cache_size,
851 # init_file => $cache_init
852 # )) {
853 # print "Unable to intialize cache '$cache_file'\n\n";
854 # exit(1);
855 # };
Akron11c80302016-03-18 19:44:43 +0100856
Akron486f9ab2017-04-22 23:25:19 +0200857
Akron941c1a62016-02-23 17:41:41 +0100858 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100859 if (-d $input[0]) {
860 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100861 my @dirs;
862 my $dir;
863
Akron7d4cdd82016-08-17 21:39:45 +0200864 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100865 while (1) {
866 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200867 push @dirs, $dir;
868 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100869 };
870 last unless $it->next;
871 };
872
873 print "Start processing ...\n";
874 $t = Benchmark->new;
875 $count = scalar @dirs;
876
877 DIRECTORY_LOOP:
878 for (my $i = 0; $i < $count; $i++) {
879
Akrone1dbc382016-07-08 22:24:52 +0200880 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200881 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200882 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200883 );
Akron941c1a62016-02-23 17:41:41 +0100884
885 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200886 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200887
Akron13d56622016-10-31 14:54:49 +0100888 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200889 $pool->finish(
890 0,
Akronda3097e2017-04-23 19:53:57 +0200891 [
892 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
893 undef,
894 $filename
895 ]
Akron486f9ab2017-04-22 23:25:19 +0200896 );
Akron3ec48972016-08-17 23:24:52 +0200897 }
898 else {
Akron4c0cf312016-10-15 16:42:09 +0200899 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200900 };
Akron941c1a62016-02-23 17:41:41 +0100901 };
902 }
903
904 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200905 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200906
Akron941c1a62016-02-23 17:41:41 +0100907 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200908 $log->error("Unzip is not installed or incompatible.");
909 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100910 };
911
Akron08385f62016-03-22 20:37:04 +0100912 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200913 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100914
Akron31a08cb2019-02-20 20:43:26 +0100915 # Get sigles to extract
916 my $prefix = set_sigle($archive);
917
Akron941c1a62016-02-23 17:41:41 +0100918 print "Start processing ...\n";
919 $t = Benchmark->new;
920 my @dirs = $archive->list_texts;
921 $count = scalar @dirs;
922
923 ARCHIVE_LOOP:
924 for (my $i = 0; $i < $count; $i++) {
925
926 # Split path information
927 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
928
Akrone1dbc382016-07-08 22:24:52 +0200929 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200930 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200931 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200932 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200933 catfile($corpus, $doc, $text)
934 . '.json' . ($gzip ? '.gz' : '')
935 )
Akrone1dbc382016-07-08 22:24:52 +0200936 );
Akron941c1a62016-02-23 17:41:41 +0100937
938 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200939 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100940
Akron4c0cf312016-10-15 16:42:09 +0200941 # Create temporary file
942 $temp = File::Temp->newdir;
943
Akronbdf434a2016-10-24 17:42:07 +0200944 # TODO: Check if $filename exist at the beginning,
945 # because extraction can be horrible slow!
946
Akron941c1a62016-02-23 17:41:41 +0100947 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100948 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100949
Akron7d4cdd82016-08-17 21:39:45 +0200950 # Create corpus directory
951 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100952
Akron7d4cdd82016-08-17 21:39:45 +0200953 # Temporary directory
954 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100955
Akron7d4cdd82016-08-17 21:39:45 +0200956 # Write file
Akron13d56622016-10-31 14:54:49 +0100957 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200958
Akron4c0cf312016-10-15 16:42:09 +0200959 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100960 $pool->finish(
961 0,
Akronda3097e2017-04-23 19:53:57 +0200962 [
963 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
964 $temp,
965 $filename
966 ]
Akron13d56622016-10-31 14:54:49 +0100967 );
968 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200969 }
970 else {
Akron4c0cf312016-10-15 16:42:09 +0200971 # Delete temporary file
972 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200973 };
Akron941c1a62016-02-23 17:41:41 +0100974 }
Akron7d4cdd82016-08-17 21:39:45 +0200975
976 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100977 else {
Akron4c0cf312016-10-15 16:42:09 +0200978 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100979 };
980 };
981 }
982
983 else {
984 print "Input is neither a directory nor an archive.\n\n";
985 };
986
987 $pool->wait_all_children;
988
Akron11c80302016-03-18 19:44:43 +0100989 # Delete cache file
990 unlink($cache_file) if $cache_delete;
991
Akronda3097e2017-04-23 19:53:57 +0200992 # Close tar filehandle
993 if ($to_tar && $tar_fh) {
994 $tar_archive->finish;
995 $tar_fh->close;
996 print "Wrote to tar archive.\n";
997 };
998
Akron63f20d42017-04-10 23:40:29 +0200999 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001000 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001001};
Akron941c1a62016-02-23 17:41:41 +01001002
Nils Diewald2db9ad02013-10-29 19:26:43 +00001003
# For an archive, this will create the list
# of all sigles to process.
#
# Expects the archive object as the only parameter and
# works on the file-level list @sigle:
#
# - In case no sigles were requested, @sigle is filled with
#   the text sigles (corpus/doc/text) of all texts the archive lists.
# - In case sigles were requested, all document sigles (corpus/doc)
#   are extracted immediately to $output, and only the remaining
#   sigles are kept in @sigle.
#
# Returns the prefix value reported by the archive (from split_path
# for the last listed text, or from check_prefix), or 1 in case the
# archive was never asked for it.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # The prefix is requested from the archive only once,
  # no matter how many sigles are passed
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Iterate over all sigle
  foreach my $entry (@sigle) {

    # Sigle is a text sigle - keep it for later processing
    unless ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @text_sigle, $entry;
      $check_prefix_once->();
      next;
    };

    # Sigle is a doc sigle - extract the whole document right away
    print "$entry ...";
    $check_prefix_once->();
    print "\n";

    # With sequential extraction a single job is used,
    # otherwise the number of jobs applies to extraction as well
    my $extracted = $archive->extract_sigle(
      [$entry],
      $output,
      $sequential_extraction ? 1 : $jobs
    );

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  # Only text sigles are left to process
  @sigle = @text_sigle;

  return $prefix;
};
1077
1078
1079
# Remove the extraction directory again (in case one was used)
# and report the number of objects that were deleted
if ($extract_dir) {
  my %remove_options = (safe => 1);
  my $objects = remove_tree($extract_dir, \%remove_options);
  print "Removed directory $extract_dir with $objects objects.\n";
};


# Finish the output with an empty line
print "\n";
1088
Nils Diewald2db9ad02013-10-29 19:26:43 +00001089__END__
Akron941c1a62016-02-23 17:41:41 +01001090
1091=pod
1092
1093=encoding utf8
1094
1095=head1 NAME
1096
Akron42f48c12020-02-14 13:08:13 +01001097korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001098
1099
1100=head1 SYNOPSIS
1101
Akrona76d8352016-10-27 16:27:32 +02001102 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001103
Akron2fd402b2016-10-27 21:26:48 +02001104
Akron941c1a62016-02-23 17:41:41 +01001105=head1 DESCRIPTION
1106
1107L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1108compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001109The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001110
1111
1112=head1 INSTALLATION
1113
1114The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1115
Akronaf386982016-10-12 00:33:25 +02001116 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001117
Akronc13a1702016-03-15 19:33:14 +01001118In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001119be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001120Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akrona93d51b2016-10-24 20:27:48 +02001121In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001122
1123=head1 ARGUMENTS
1124
Akrona76d8352016-10-27 16:27:32 +02001125 $ korapxml2krill -z --input <directory> --output <filename>
1126
1127Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001128It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001129
Akron941c1a62016-02-23 17:41:41 +01001130=over 2
1131
1132=item B<archive>
1133
Akron081639e2017-04-21 19:01:39 +02001134 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001135
Akron2fd402b2016-10-27 21:26:48 +02001136Converts an archive of KorAP-XML documents. It expects a directory
1137(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001138
1139=item B<extract>
1140
Akrona76d8352016-10-27 16:27:32 +02001141 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1142
1143Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001144
Akron63f20d42017-04-10 23:40:29 +02001145=item B<serial>
1146
1147 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1148
1149Convert archives sequentially. The inputs are not merged but treated
1150as they are (so they may be premerged or globs).
1151The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001152are created based on the archive name. In case the C<--to-tar> flag is given,
1153the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001154
1155
Akron941c1a62016-02-23 17:41:41 +01001156=back
1157
1158
1159=head1 OPTIONS
1160
1161=over 2
1162
Akrona76d8352016-10-27 16:27:32 +02001163=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001164
Akrona76d8352016-10-27 16:27:32 +02001165Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001166
Akron7606afa2016-10-25 16:23:49 +02001167Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001168document, while C<archive> expects a KorAP-XML corpus folder or a zip
1169file to batch process multiple files.
1170C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001171
Akrona76d8352016-10-27 16:27:32 +02001172C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001173that the first archive listed contains all primary data files
1174and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001175
Akron7606afa2016-10-25 16:23:49 +02001176 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001177
Akron821db3d2017-04-06 21:19:31 +02001178Input may also be defined using BSD glob wildcards.
1179
1180 -i 'file/news*.zip'
1181
1182The extended input array will be sorted in length order, so the shortest
1183path needs to contain all primary data files and all meta data files.
1184
Akron0c3e3752016-06-28 15:55:53 +02001185(The directory structure follows the base directory format,
1186that may include a C<.> root folder.
1187In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001188need to be passed with a hash sign in front of the archive's name.
1189This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001190
Akron7606afa2016-10-25 16:23:49 +02001191To support zip files, a version of C<unzip> needs to be installed that is
1192compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001193
Akron7606afa2016-10-25 16:23:49 +02001194B<The root folder switch using the hash sign is experimental and
1195may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001196
Akronf73ffb62018-06-27 12:13:59 +02001197
Akron63f20d42017-04-10 23:40:29 +02001198=item B<--input-base|-ib> <directory>
1199
1200The base directory for inputs.
1201
1202
Akron941c1a62016-02-23 17:41:41 +01001203=item B<--output|-o> <directory|file>
1204
1205Output folder for archive processing or
1206document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001207writes to C<STDOUT> by default
1208(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001209
1210=item B<--overwrite|-w>
1211
1212Overwrite files that already exist.
1213
Akronf73ffb62018-06-27 12:13:59 +02001214
Akron3741f8b2016-12-21 19:55:21 +01001215=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001216
1217Define the default tokenization by specifying
1218the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001219of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001220This will directly take the file instead of running
1221the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001222
Akron3741f8b2016-12-21 19:55:21 +01001223
1224=item B<--base-sentences|-bs> <foundry>#<layer>
1225
1226Define the layer for base sentences.
1227If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001228Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1229layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001230
1231 Defaults to unset.
1232
1233
1234=item B<--base-paragraphs|-bp> <foundry>#<layer>
1235
1236Define the layer for base paragraphs.
1237If given, this will be used instead of using C<Base#Paragraphs>.
1238Currently C<DeReKo#Structure> is the only additional layer supported.
1239
1240 Defaults to unset.
1241
1242
Akron41ac10b2017-02-08 22:47:25 +01001243=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1244
1245Define the layer for base pagebreaks.
1246Currently C<DeReKo#Structure> is the only layer supported.
1247
1248 Defaults to unset.
1249
1250
Akron941c1a62016-02-23 17:41:41 +01001251=item B<--skip|-s> <foundry>[#<layer>]
1252
Akronf7ad89e2016-03-16 18:22:47 +01001253Skip specific annotations by specifying the foundry
1254(and optionally the layer with a C<#>-prefix),
1255e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001256Can be set multiple times.
1257
Akronf73ffb62018-06-27 12:13:59 +02001258
Akronc13a1702016-03-15 19:33:14 +01001259=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001260
Akronf7ad89e2016-03-16 18:22:47 +01001261Convert specific annotations by specifying the foundry
1262(and optionally the layer with a C<#>-prefix),
1263e.g. C<Mate> or C<Mate#Morpho>.
1264Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001265
Akronf73ffb62018-06-27 12:13:59 +02001266
Akron941c1a62016-02-23 17:41:41 +01001267=item B<--primary|-p>
1268
Akronc13a1702016-03-15 19:33:14 +01001269Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001270Can be flagged using C<--no-primary> as well.
1271This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001272
Akronf73ffb62018-06-27 12:13:59 +02001273
Akroned9baf02019-01-22 17:03:25 +01001274=item B<--non-word-tokens|-nwt>
1275
1276Tokenize non-word tokens like word tokens (defined as matching
1277C</[\d\w]/>). Useful to treat punctuations as tokens.
1278
1279 Defaults to unset.
1280
Akronf1849aa2019-12-16 23:35:33 +01001281
1282=item B<--non-verbal-tokens|-nvt>
1283
1284Tokenize non-verbal tokens marked in the primary data as
1285the unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1286
1287 Defaults to unset.
1288
1289
Akron941c1a62016-02-23 17:41:41 +01001290=item B<--jobs|-j>
1291
1292Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001293for archive processing.
Akron11c80302016-03-18 19:44:43 +01001294Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001295
1296Unless C<sequential-extraction> is set, this will
1297also apply to extraction.
1298
Akronc11f7982017-02-21 21:20:14 +01001299Pass -1, and the value will be set automatically to 5
1300times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001301This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron263274c2019-02-07 09:48:30 +01001304=item B<--koral|-k>
1305
1306Version of the output format. Supported versions are:
1307C<0> for legacy serialization, C<0.03> for serialization
1308with metadata fields as key-values on the root object,
1309C<0.4> for serialization with metadata fields as a list
1310of C<"@type":"koral:field"> objects.
1311
1312Currently defaults to C<0.03>.
1313
1314
Akron9ec88872017-04-12 16:29:06 +02001315=item B<--sequential-extraction|-se>
1316
1317Flag to extract sequentially, i.e. the C<jobs> value is not applied to extraction.
1318Some systems may have problems with extracting multiple archives
1319to the same folder at the same time.
1320Can be flagged using C<--no-sequential-extraction> as well.
1321Defaults to C<false>.
1322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akron35db6e32016-03-17 22:42:22 +01001324=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001325
Akron35db6e32016-03-17 22:42:22 +01001326Define the metadata parser to use. Defaults to C<I5>.
1327Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1328This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron941c1a62016-02-23 17:41:41 +01001331=item B<--pretty|-y>
1332
Akronc13a1702016-03-15 19:33:14 +01001333Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001334This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001335
Akronf73ffb62018-06-27 12:13:59 +02001336
Akron941c1a62016-02-23 17:41:41 +01001337=item B<--gzip|-z>
1338
Akronf7ad89e2016-03-16 18:22:47 +01001339Compress the output.
1340Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001341
Akronf73ffb62018-06-27 12:13:59 +02001342
Akron11c80302016-03-18 19:44:43 +01001343=item B<--cache|-c>
1344
1345File to mmap a cache (using L<Cache::FastMmap>).
1346Defaults to C<korapxml2krill.cache> in the calling directory.
1347
Akronf73ffb62018-06-27 12:13:59 +02001348
Akron11c80302016-03-18 19:44:43 +01001349=item B<--cache-size|-cs>
1350
1351Size of the cache. Defaults to C<50m>.
1352
Akronf73ffb62018-06-27 12:13:59 +02001353
Akron11c80302016-03-18 19:44:43 +01001354=item B<--cache-init|-ci>
1355
1356Initialize cache file.
1357Can be flagged using C<--no-cache-init> as well.
1358Defaults to C<true>.
1359
Akronf73ffb62018-06-27 12:13:59 +02001360
Akron11c80302016-03-18 19:44:43 +01001361=item B<--cache-delete|-cd>
1362
1363Delete cache file after processing.
1364Can be flagged using C<--no-cache-delete> as well.
1365Defaults to C<true>.
1366
Akronf73ffb62018-06-27 12:13:59 +02001367
Akron636aa112017-04-07 18:48:56 +02001368=item B<--config|-cfg>
1369
1370Configure the parameters of your call in a file
1371of key-value pairs with whitespace separator
1372
1373 overwrite 1
1374 token DeReKo#Structure
1375 ...
1376
1377Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001378C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001379C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001380C<output>, C<koral>,
1381C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001382C<base-sentences>, C<base-paragraphs>,
1383C<base-pagebreaks>,
1384C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001385(semicolon separated), C<anno> (semicolon separated).
1386
Akronf73ffb62018-06-27 12:13:59 +02001387Configuration parameters will always be overwritten by
1388passed parameters.
1389
1390
Akron81500102017-04-07 20:45:44 +02001391=item B<--temporary-extract|-te>
1392
1393Only valid for the C<archive> command.
1394
1395This will first extract all files into a
1396directory and then will archive.
1397If the directory is given as C<:temp:>,
1398a temporary directory is used.
1399This is especially useful to avoid
1400massive unzipping and potential
1401network latency.
Akron636aa112017-04-07 18:48:56 +02001402
Akronf73ffb62018-06-27 12:13:59 +02001403
Akronc93a0802019-07-11 15:48:34 +02001404=item B<--to-tar>
1405
1406Only valid for the C<archive> command.
1407
1408Writes the output into a tar archive.
1409
1410
Akrone10ad322016-02-27 10:54:26 +01001411=item B<--sigle|-sg>
1412
Akron20807582016-10-26 17:11:34 +02001413Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001414Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001415I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001416Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001417In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001418On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001419
Akronf73ffb62018-06-27 12:13:59 +02001420
Akron941c1a62016-02-23 17:41:41 +01001421=item B<--log|-l>
1422
1423The L<Log::Log4perl> log level, defaults to C<ERROR>.
1424
Akronf73ffb62018-06-27 12:13:59 +02001425
Akron941c1a62016-02-23 17:41:41 +01001426=item B<--help|-h>
1427
Akron42f48c12020-02-14 13:08:13 +01001428Print help information.
Akron941c1a62016-02-23 17:41:41 +01001429
Akronf73ffb62018-06-27 12:13:59 +02001430
Akron941c1a62016-02-23 17:41:41 +01001431=item B<--version|-v>
1432
1433Print version information.
1434
1435=back
1436
Akronf73ffb62018-06-27 12:13:59 +02001437
Akronc13a1702016-03-15 19:33:14 +01001438=head1 ANNOTATION SUPPORT
1439
1440L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1441developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1442The base foundry with paragraphs, sentences, and the text element is mandatory for
1443L<Krill|https://github.com/KorAP/Krill>.
1444
Akron821db3d2017-04-06 21:19:31 +02001445 Base
1446 #Paragraphs
1447 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001448
Akron821db3d2017-04-06 21:19:31 +02001449 Connexor
1450 #Morpho
1451 #Phrase
1452 #Sentences
1453 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001454
Akron821db3d2017-04-06 21:19:31 +02001455 CoreNLP
1456 #Constituency
1457 #Morpho
1458 #NamedEntities
1459 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001460
Akronce125b62017-06-19 11:54:36 +02001461 CMC
1462 #Morpho
1463
Akron821db3d2017-04-06 21:19:31 +02001464 DeReKo
1465 #Structure
Akronc13a1702016-03-15 19:33:14 +01001466
Akron57510c12019-01-04 14:58:53 +01001467 DGD
1468 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001469 #Structure
Akron57510c12019-01-04 14:58:53 +01001470
Akron821db3d2017-04-06 21:19:31 +02001471 DRuKoLa
1472 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001473
Akron821db3d2017-04-06 21:19:31 +02001474 Glemm
1475 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001476
Akronea1aed52018-07-19 14:43:34 +02001477 HNC
1478 #Morpho
1479
Akron4c679192018-01-16 17:41:49 +01001480 LWC
1481 #Dependency
1482
Akron821db3d2017-04-06 21:19:31 +02001483 Malt
1484 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001485
Akron821db3d2017-04-06 21:19:31 +02001486 MarMoT
1487 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001488
Akron821db3d2017-04-06 21:19:31 +02001489 Mate
1490 #Dependency
1491 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001492
Akron821db3d2017-04-06 21:19:31 +02001493 MDParser
1494 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001495
Akron821db3d2017-04-06 21:19:31 +02001496 OpenNLP
1497 #Morpho
1498 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001499
Akron07e24772020-04-23 14:00:54 +02001500 RWK
1501 #Morpho
1502 #Structure
1503
Akron821db3d2017-04-06 21:19:31 +02001504 Sgbr
1505 #Lemma
1506 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001507
Akron7d5e6382019-08-08 16:36:27 +02001508 Talismane
1509 #Dependency
1510 #Morpho
1511
Akron821db3d2017-04-06 21:19:31 +02001512 TreeTagger
1513 #Morpho
1514 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001515
Akron821db3d2017-04-06 21:19:31 +02001516 XIP
1517 #Constituency
1518 #Morpho
1519 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001520
Akronc13a1702016-03-15 19:33:14 +01001521
1522More importers are in preparation.
1523New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1524See the built-in annotation importers as examples.
1525
Akronf73ffb62018-06-27 12:13:59 +02001526
Akron8f69d632020-01-15 16:58:11 +01001527=head1 About KorAP-XML
1528
1529KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1530data model (Bański et al. 2013), where text data are stored physically
1531separated from their interpretations (i.e. annotations).
1532A text document in KorAP-XML therefore consists of several files
1533containing primary data, metadata and annotations.
1534
1535The structure of a single KorAP-XML document can be as follows:
1536
1537 - data.xml
1538 - header.xml
1539 + base
1540 - tokens.xml
1541 - ...
1542 + struct
1543 - structure.xml
1544 - ...
1545 + corenlp
1546 - morpho.xml
1547 - constituency.xml
1548 - ...
1549 + tree_tagger
1550 - morpho.xml
1551 - ...
1552 - ...
1553
1554The C<data.xml> contains the primary data, the C<header.xml> contains
1555the metadata, and the annotation layers are stored in subfolders
1556like C<base>, C<struct> or C<corenlp>
1557(so-called "foundries"; Bański et al. 2013).
1558
1559Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001560(Lüngen and Sperberg-McQueen 2012). See the documentation in
1561L<KorAP::XML::Meta::I5> for translatable fields.
1562
1563Annotations correspond to a variant of the TEI-P5 feature structures
1564(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001565Annotation feature structures refer to character sequences of the primary text
1566inside the C<text> element of the C<data.xml>.
1567A single annotation containing the lemma of a token can have the following structure:
1568
1569 <span from="0" to="3">
1570 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1571 <f name="lex">
1572 <fs>
1573 <f name="lemma">zum</f>
1574 </fs>
1575 </f>
1576 </fs>
1577 </span>
1578
1579The C<from> and C<to> attributes refer to the character span
1580in the primary text.
1581Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1582the structure may vary. See L<KorAP::XML::Annotation::*> for various
1583annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001584
1585Multiple KorAP-XML documents are organized on three levels following
1586the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1587corpus E<gt> document E<gt> text. On each level metadata information
1588can be stored, that C<korapxml2krill> will merge to a single metadata
1589object per text. A corpus is therefore structured as follows:
1590
1591 + <corpus>
1592 - header.xml
1593 + <document>
1594 - header.xml
1595 + <text>
1596 - data.xml
1597 - header.xml
1598 - ...
1599 - ...
1600
1601A single text can be identified by the concatenation of
1602the corpus identifier, the document identifier and the text identifier.
1603This identifier is called the text sigle
1604(e.g. a text with the identifier C<18486> in the document C<060> in the
1605corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1606
1607These corpora are often stored in zip files, which C<korapxml2krill>
1608can deal with. Corpora may also be split into multiple zip archives
1609(e.g. one zip file per foundry), which is also supported (see C<--input>).
1610
1611Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1612in form of a test suite.
1613The resulting JSON format merges all annotation layers
1614based on a single token stream.
1615
1616=head2 References
1617
1618Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1619KorAP data model: first approximation, December.
1620
1621Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1622"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1623Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1624L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1625
1626Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1627"Robust corpus architecture: a new look at virtual collections and data access",
1628Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1629L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1630
1631Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1632Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1633"Towards an international standard on feature structure representation",
1634Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1635pp. 373-376.
1636L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1637
1638Harald Lüngen and C. M. Sperberg-McQueen (2012):
1639"A TEI P5 Document Grammar for the IDS Text Model",
1640Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1641L<PDF|https://journals.openedition.org/jtei/pdf/508>
1642
1643TEI Consortium, eds:
1644"Feature Structures",
1645Guidelines for Electronic Text Encoding and Interchange.
1646L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1647
Akron941c1a62016-02-23 17:41:41 +01001648=head1 AVAILABILITY
1649
1650 https://github.com/KorAP/KorAP-XML-Krill
1651
1652
1653=head1 COPYRIGHT AND LICENSE
1654
Akron8f69d632020-01-15 16:58:11 +01001655Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001656
Akron8f69d632020-01-15 16:58:11 +01001657Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001658
Akrona76d8352016-10-27 16:27:32 +02001659Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001660
1661L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1662Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001663L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001664member of the
Akronf1849aa2019-12-16 23:35:33 +01001665L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001666
1667This program is free software published under the
1668L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1669
1670=cut