blob: 54edaa8285037ce350ded320c45b20c26bd7ab6e [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
Akron07e24772020-04-23 14:00:54 +0200157our $LAST_CHANGE = '2020/04/23';
Akron941c1a62016-02-23 17:41:41 +0100158our $LOCAL = $FindBin::Bin;
Akron263274c2019-02-07 09:48:30 +0100159our $KORAL_VERSION = 0.03;
Akron941c1a62016-02-23 17:41:41 +0100160our $VERSION_MSG = <<"VERSION";
161Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
162VERSION
163
Akron63f20d42017-04-10 23:40:29 +0200164# Prototypes
165sub get_file_name_from_glob($);
166sub get_file_name($);
167
# Parse command: a leading argument that does not start with a dash
# selects the mode of operation (e.g. 'serial', 'extract' or 'archive').
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Remember the remaining raw arguments, so they can be passed on
# to child processes in serial mode
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100175
# Targets for options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Targets for single value options and flags
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar,
  $non_word_tokens, $non_verbal_tokens, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,

  # Tokenization and base layers
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,

  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,

  # Filters
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,

  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'non-verbal-tokens|nvt'    => \$non_verbal_tokens,
  'sequential-extraction|se' => \$sequential_extraction,

  # Cache handling
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Print the usage sections of the documentation
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
225
Akron63f20d42017-04-10 23:40:29 +0200226
# Load from configuration.
# Values from the configuration file are only used for options
# that were not given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: configuration key and target variable
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'non-verbal-tokens'     => \$non_verbal_tokens ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # The command line takes precedence
    next if defined $$target;
    $$target = $config{$key} if defined $config{$key};
  };

  # Multi value options are given as semicolon separated lists
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # The command line takes precedence
    next if scalar(@$target) || !defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
353
Akron63f20d42017-04-10 23:40:29 +0200354
# Set defaults for all options that were neither passed on the
# command line nor defined in the configuration
for my $default (
  [ \$token_base            => 'OpenNLP#tokens' ],
  [ \$cache_file            => 'korapxml2krill.cache' ],
  [ \$cache_size            => '50m' ],
  [ \$jobs                  => 0 ],
  [ \$koral                 => $KORAL_VERSION ],
  [ \$cache_delete          => 1 ],
  [ \$cache_init            => 1 ],
  [ \$sequential_extraction => 0 ],
  [ \$log_level             => 'ERROR' ],
  [ \$base_sentences        => '' ],
  [ \$base_paragraphs       => '' ],
  [ \$base_pagebreaks       => '' ],
  [ \$non_word_tokens       => 0 ],
  [ \$non_verbal_tokens     => 0 ]
) {
  my ($target, $value) = @$default;
  $$target //= $value;
};

# Normalize the base layer references to lower case, as they are
# compared with lower case literals later on
$_ = lc($_) foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100374
Akron63f20d42017-04-10 23:40:29 +0200375
# Initialize log4perl object: all messages of the requested level
# are written colored to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger'
    => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file passed on the command line
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
388
389
# Parameters for printing the usage information and
# exiting with an error code
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000403
Akronc11f7982017-02-21 21:20:14 +0100404
# A job count of -1 requests five jobs per available CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
410
Akron821db3d2017-04-06 21:19:31 +0200411
# Start serial processing: every input is converted by a separate
# 'archive' run of this script, writing below the output directory
# to a path named after the input
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the original arguments,
  # as they are set individually for each child process.
  # Both the separated form ("--input value") and the joined
  # form ("--input=value") are removed - otherwise every child
  # would receive all inputs.
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag
    if (/\A--?(?:i|input|o|output)(=.*)?\z/s) {

      # The value is the next argument unless it was joined
      $remove_next = defined $1 ? 0 : 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Number of inputs that could not be processed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving and report failing child processes
    # instead of silently ignoring them
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $in failed: " .
          ($? == -1 ? "unable to execute ($!)" : 'exit status ' . ($? >> 8))
        );
      $failed++;
    };
  };

  # Signal failure to the caller, if any input failed
  exit($failed ? 1 : 0);
};
466
# Lookup table of all (lower cased) foundries and layers to skip
my %skip = map { lc($_) => 1 } @skip;

# DeReKo structure attributes that serve as base layers
my @dereko_attr =
  map  { $_->[0] }
  grep { $_->[1] eq 'dereko#structure' } (
    [ 'sentences'  => $base_sentences ],
    [ 'paragraphs' => $base_paragraphs ],
    [ 'pagebreaks' => $base_pagebreaks ]
  );

# All supported annotation layers in the form of
# [Foundry, Layer, (optional) parameter]
my @layers = (

  # Base - only if no alternative base layer is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - with the requested base attributes, if any
  [
    'DeReKo', 'Structure',
    (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())
  ],

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure'
     ? ['DGD', 'Structure', 'base-sentence'] : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
567
Akron4fa37c32017-01-20 14:43:10 +0100568
# Check filters:
# If all annotations are skipped, only the explicitly requested
# annotations ("Foundry#Layer") are used. Otherwise all supported
# layers are used, unless the foundry or the "foundry#layer"
# combination is skipped.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      my ($foundry, $layer) = (lc($_->[0]), lc($_->[1]));
      !($skip{$foundry} || $skip{$foundry . '#' . $layer});
    } @layers);
587
# Get tokenization basis in the form of "Foundry#Layer".
# The variables are declared unconditionally, as the behaviour
# of a declaration with a statement modifier ("my ... if ...")
# is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined,
# if the basis contains no layer part
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200593
# Cache shared with the batch processor
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that converts single documents based on
# the tokenization basis and the filtered annotation layers
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $meta,
  overwrite         => $overwrite,
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => $koral,
  primary           => $primary,
  pretty            => $pretty,
  anno              => \@filtered_anno,
  non_word_tokens   => $non_word_tokens,
  non_verbal_tokens => $non_verbal_tokens
);
617
# Get file name based on path information.
#
# Expects the path of a document and returns a flat name, in which
# the path of the first input is removed and all remaining path
# segments are joined by dashes.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input - in case of a directory without
  # a trailing slash, the last segment is removed
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is quoted to be matched
  # literally, as it may contain regular expression metacharacters
  # (like '.', '+' or brackets).
  $file =~ s/\/?\Q$i\E//;

  # Join the path segments
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Remove the first segment, if at least three segments remain
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
634
Akron63f20d42017-04-10 23:40:29 +0200635
# Turn a (possibly wildcarded) input path into a flat name,
# that can be used as the name of an output directory.
# Expects the glob pattern and returns the cleaned name.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  for ($name) {
    tr!\\/!-!;    # Transform paths
    tr/*?//d;     # Remove arbitrary fills
    tr/{}[]/-/;   # Remove class and multiple brackets
    tr/-//s;      # Remove sequences of binding characters
    s/^-//;       # Clean beginning
    s/-$//;       # Clean end
    s/\.zip$//;   # Remove file extension
  };

  return $name;
};
647
648
# Convert sigle to path construct (e.g. "A_B.C" becomes "A/B/C")
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# For all commands a given output has to be an existing directory,
# unless the output is written to a tar archive
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
658
Akron63f20d42017-04-10 23:40:29 +0200659
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand all wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n";
};
679
680
# Process a single file, in case no command was given
unless ($cmd) {

  # Take the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $step    = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $step . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # The document path requires a trailing slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
712
Nils Diewald59094f22014-11-05 18:20:50 +0000713
Akrone10ad322016-02-27 10:54:26 +0100714# Extract XML files
Akron81500102017-04-07 20:45:44 +0200715if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100716
Akrond5643ad2017-07-04 20:27:13 +0200717 # Output is required
718 pod2usage(%ERROR_HASH) unless $output;
719
Akron7d4cdd82016-08-17 21:39:45 +0200720 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200721 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100722
Akron7d4cdd82016-08-17 21:39:45 +0200723 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100724 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200725 $log->error("Unzip is not installed or incompatible.");
726 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100727 };
728
Akronb0c88db2016-06-29 16:33:18 +0200729 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200730 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200731
Akron31a08cb2019-02-20 20:43:26 +0100732 # Will set @sigle
733 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200734
Akrone10ad322016-02-27 10:54:26 +0100735 # Iterate over all given sigles and extract
736 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100737
Akron2812ba22016-10-28 21:55:59 +0200738 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200739
Akron03b24db2016-08-16 20:54:32 +0200740 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200741 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100742
Akron955b75b2019-02-21 14:28:41 +0100743 # TODO:
744 # - prefix???
745 $archive->extract_sigle([$_], $output, $jobs)
746 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200747 );
Akrone10ad322016-02-27 10:54:26 +0100748 print "extracted.\n";
749 };
Akronb0c88db2016-06-29 16:33:18 +0200750 }
Akron7d4cdd82016-08-17 21:39:45 +0200751
752 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200753 else {
754 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200755 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100756 };
757}
758
Akron81500102017-04-07 20:45:44 +0200759
# Process an archive
#
# Converts every text of the input to one JSON file (optionally gzipped).
# The input is either a directory tree (each folder containing a data.xml
# is treated as one text) or a zip archive, with all further inputs
# attached as additional annotation archives.
#
# - If $extract_dir is defined and the input is not a directory, the
#   archive is extracted completely first and processing continues on
#   the extracted directory.
# - If $to_tar is set, the JSON files are written to a temporary
#   directory and moved into a tar file by the run_on_finish callback.
# - Work is distributed with Parallel::ForkManager using $jobs forks.
elsif ($cmd eq 'archive') {

  # First extract, then archive
  if (defined $extract_dir && !-d $input[0]) {

    # Create new archive object
    if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

      # Check zip capabilities
      unless ($archive->test_unzip) {
        $log->error("Unzip is not installed or incompatible.");
        exit 1;
      };

      # Add further annotation archived
      $archive->attach($_) foreach @input[1..$#input];

      # Create a temporary directory
      if ($extract_dir eq ':temp:') {
        $extract_dir = tempdir(CLEANUP => 0);
        print "Temporarily extract to $extract_dir\n";
      };

      # Add some random extra to avoid clashes with multiple archives
      $extract_dir = catdir($extract_dir, random_string('cccccc'));

      # Extract to temporary directory
      # (sequential extraction forces a single extraction job)
      if ($archive->extract_all($extract_dir, $sequential_extraction ? 1 : $jobs)) {

        # From now on the extracted directory is the only input
        @input = ($extract_dir);
      }
      else {
        $log->error('Unable to extract from primary archive ' . $input[0] .
                      ' to ' . $extract_dir);
        exit 1;
      };
    }

    # Can't create archive object
    else {
      $log->error('Unable to extract from primary archive ' . $input[0]);
      exit 1;
    };
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  my $tar_archive;
  my $output_dir = $output;
  my $tar_fh;

  # Initialize tar archive
  if ($to_tar) {
    $tar_archive = Archive::Tar::Builder->new(
      ignore_errors => 1
    );

    # Set output name
    my $tar_file = $output;
    unless ($tar_file =~ /\.tar$/) {
      $tar_file .= '.tar';
    };

    # Initiate the tar file
    print "Writing to file $tar_file\n";
    $tar_fh = IO::File->new($tar_file, 'w');

    # Fail with a proper message instead of crashing on an
    # undefined handle below
    unless ($tar_fh) {
      $log->error("Unable to open tar file $tar_file for writing: $!");
      exit 1;
    };

    # Write raw bytes (binmode expects a layer name or nothing,
    # a plain true value is not a valid layer)
    $tar_fh->binmode;

    # Set handle
    $tar_archive->set_handle($tar_fh);

    # Output to temporary directory
    $output_dir = File::Temp->newdir;
  };

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data structure passed to finish() is the last argument:
      # [message, temporary directory (optional), output file (optional)].
      # It is undefined if a child ended without passing data.
      my $data = pop(@_) // [];

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . ($data->[0] // '') . "\n";

      # Move successfully written files into the tar archive
      if (!$code && $to_tar && $data->[2]) {
        my $filename = $data->[2];

        # Lock filehandle
        if (flock($tar_fh, LOCK_EX)) {

          # Store the file without its directory part
          my $clean_file = fileparse($filename);

          # Archive and remove file
          $tar_archive->archive_as($filename => $clean_file);
          unlink $filename;

          # Unlock filehandle
          flock($tar_fh, LOCK_UN);
        }
        else {
          $log->warn("Unable to add $filename to archive");
        };
      };

      # Drop the reference to the temporary directory object
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;    # Benchmark start, only set once processing starts
  my $temp; # Temporary extraction directory of the current text
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };


  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all folders containing a data.xml
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output_dir,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      # (start returns a true value in the parent, which then
      # continues with the next text)
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(
          0,
          [
            "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
            undef,
            $filename
          ]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      $log->error("Unzip is not installed or incompatible.");
      exit 1;
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # Get sigles to extract
    # (called for its effect on the sigle list, the returned
    # prefix is not needed here)
    set_sigle($archive);

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information (the leading prefix is not needed)
      my (undef, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output_dir,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if (my $return = $batch_file->process($dir => $filename)) {

          # Delete temporary file
          $pool->finish(
            0,
            [
              "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
              $temp,
              $filename
            ]
          );
          #$pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          # Delete temporary file
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  # Close tar filehandle
  if ($to_tar && $tar_fh) {
    $tar_archive->finish;

    # Buffered write errors surface on close
    if ($tar_fh->close) {
      print "Wrote to tar archive.\n";
    }
    else {
      $log->error("Unable to close tar archive: $!");
    };
  };

  # The benchmark is only started if there was something to process,
  # and timediff dies on an undefined start value
  print timestr(timediff(Benchmark->new, $t))."\n" if $t;
  print "Done.\n";
};
Akron941c1a62016-02-23 17:41:41 +01001033
Nils Diewald2db9ad02013-10-29 19:26:43 +00001034
# For an archive, this will create the list
# of all sigles to process.
#
# The list is kept in the file-level @sigle array:
#
# - If @sigle is empty, every text listed by the archive is added
#   as "corpus/doc/text".
# - Otherwise document sigles ("corpus/doc", optionally preceded by
#   "./") are extracted to $output right away and removed from @sigle,
#   so only text sigles remain.
#
# Returns the prefix: the one reported by split_path for the last
# listed text, or the result of check_prefix if sigles were given.
# Defaults to 1 if the archive lists no texts.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Check if a prefix is needed - this is only done once,
  # for the first sigle in the list
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Iterate over all sigle
  # (using a named variable, so called methods can't clobber it)
  foreach my $requested (@sigle) {

    # Sigle is a doc sigle
    if ($requested =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$requested ...";
      $check_prefix_once->();
      print "\n";

      # Extract the whole document immediately
      print '... ' . (
        $archive->extract_sigle([$requested], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $requested;
      $check_prefix_once->();
    };
  };

  # Only text sigles are left to process
  @sigle = @text_sigle;

  return $prefix;
};
1108
1109
1110
# Remove the extraction directory, if one was used,
# and report the number of deleted objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir .
    ' with ' . $removed . " objects.\n";
};


# Finish the output with an empty line
print "\n";
1119
Nils Diewald2db9ad02013-10-29 19:26:43 +00001120__END__
Akron941c1a62016-02-23 17:41:41 +01001121
1122=pod
1123
1124=encoding utf8
1125
1126=head1 NAME
1127
Akron42f48c12020-02-14 13:08:13 +01001128korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001129
1130
1131=head1 SYNOPSIS
1132
Akrona76d8352016-10-27 16:27:32 +02001133 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001134
Akron2fd402b2016-10-27 21:26:48 +02001135
Akron941c1a62016-02-23 17:41:41 +01001136=head1 DESCRIPTION
1137
1138L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1139compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001140The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001141
1142
1143=head1 INSTALLATION
1144
1145The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1146
Akronaf386982016-10-12 00:33:25 +02001147 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001148
Akronc13a1702016-03-15 19:33:14 +01001149In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001150be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001151Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001153
1154=head1 ARGUMENTS
1155
Akrona76d8352016-10-27 16:27:32 +02001156 $ korapxml2krill -z --input <directory> --output <filename>
1157
1158Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001159It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001160
Akron941c1a62016-02-23 17:41:41 +01001161=over 2
1162
1163=item B<archive>
1164
Akron081639e2017-04-21 19:01:39 +02001165 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001166
Akron2fd402b2016-10-27 21:26:48 +02001167Converts an archive of KorAP-XML documents. It expects a directory
1168(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001169
1170=item B<extract>
1171
Akrona76d8352016-10-27 16:27:32 +02001172 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1173
1174Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001175
Akron63f20d42017-04-10 23:40:29 +02001176=item B<serial>
1177
1178 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1179
1180Convert archives sequentially. The inputs are not merged but treated
1181as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001183are created based on the archive name. In case the C<--to-tar> flag is given,
1184the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001185
1186
Akron941c1a62016-02-23 17:41:41 +01001187=back
1188
1189
1190=head1 OPTIONS
1191
1192=over 2
1193
Akrona76d8352016-10-27 16:27:32 +02001194=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001195
Akrona76d8352016-10-27 16:27:32 +02001196Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001197
Akron7606afa2016-10-25 16:23:49 +02001198Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001199document, while C<archive> expects a KorAP-XML corpus folder or a zip
1200file to batch process multiple files.
1201C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001202
Akrona76d8352016-10-27 16:27:32 +02001203C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001204that the first archive listed contains all primary data files
1205and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001206
Akron7606afa2016-10-25 16:23:49 +02001207 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001208
Akron821db3d2017-04-06 21:19:31 +02001209Input may also be defined using BSD glob wildcards.
1210
1211 -i 'file/news*.zip'
1212
1213The extended input array will be sorted in length order, so the shortest
1214path needs to contain all primary data files and all meta data files.
1215
Akron0c3e3752016-06-28 15:55:53 +02001216(The directory structure follows the base directory format,
1217that may include a C<.> root folder.
1218In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001219need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001221
Akron7606afa2016-10-25 16:23:49 +02001222To support zip files, a version of C<unzip> needs to be installed that is
1223compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001224
Akron7606afa2016-10-25 16:23:49 +02001225B<The root folder switch using the hash sign is experimental and
1226may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001227
Akronf73ffb62018-06-27 12:13:59 +02001228
Akron63f20d42017-04-10 23:40:29 +02001229=item B<--input-base|-ib> <directory>
1230
1231The base directory for inputs.
1232
1233
Akron941c1a62016-02-23 17:41:41 +01001234=item B<--output|-o> <directory|file>
1235
1236Output folder for archive processing or
1237document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001238writes to C<STDOUT> by default
1239(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001240
1241=item B<--overwrite|-w>
1242
1243Overwrite files that already exist.
1244
Akronf73ffb62018-06-27 12:13:59 +02001245
Akron3741f8b2016-12-21 19:55:21 +01001246=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001247
1248Define the default tokenization by specifying
1249the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001250of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001251This will directly take the file instead of running
1252the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001253
Akron3741f8b2016-12-21 19:55:21 +01001254
1255=item B<--base-sentences|-bs> <foundry>#<layer>
1256
1257Define the layer for base sentences.
1258If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001259Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1260layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001261
1262 Defaults to unset.
1263
1264
1265=item B<--base-paragraphs|-bp> <foundry>#<layer>
1266
1267Define the layer for base paragraphs.
1268If given, this will be used instead of using C<Base#Paragraphs>.
1269Currently C<DeReKo#Structure> is the only additional layer supported.
1270
1271 Defaults to unset.
1272
1273
Akron41ac10b2017-02-08 22:47:25 +01001274=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1275
1276Define the layer for base pagebreaks.
1277Currently C<DeReKo#Structure> is the only layer supported.
1278
1279 Defaults to unset.
1280
1281
Akron941c1a62016-02-23 17:41:41 +01001282=item B<--skip|-s> <foundry>[#<layer>]
1283
Akronf7ad89e2016-03-16 18:22:47 +01001284Skip specific annotations by specifying the foundry
1285(and optionally the layer with a C<#>-prefix),
1286e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001287Can be set multiple times.
1288
Akronf73ffb62018-06-27 12:13:59 +02001289
Akronc13a1702016-03-15 19:33:14 +01001290=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001291
Akronf7ad89e2016-03-16 18:22:47 +01001292Convert specific annotations by specifying the foundry
1293(and optionally the layer with a C<#>-prefix),
1294e.g. C<Mate> or C<Mate#Morpho>.
1295Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001296
Akronf73ffb62018-06-27 12:13:59 +02001297
Akron941c1a62016-02-23 17:41:41 +01001298=item B<--primary|-p>
1299
Akronc13a1702016-03-15 19:33:14 +01001300Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001301Can be flagged using C<--no-primary> as well.
1302This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001303
Akronf73ffb62018-06-27 12:13:59 +02001304
Akroned9baf02019-01-22 17:03:25 +01001305=item B<--non-word-tokens|-nwt>
1306
1307Tokenize non-word tokens like word tokens (defined as matching
1308C</[\d\w]/>). Useful to treat punctuations as tokens.
1309
1310 Defaults to unset.
1311
Akronf1849aa2019-12-16 23:35:33 +01001312
1313=item B<--non-verbal-tokens|-nvt>
1314
Tokenize non-verbal tokens marked in the primary data as
1316the unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1317
1318 Defaults to unset.
1319
1320
Akron941c1a62016-02-23 17:41:41 +01001321=item B<--jobs|-j>
1322
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001324for archive processing.
Akron11c80302016-03-18 19:44:43 +01001325Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001326
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1329
Akronc11f7982017-02-21 21:20:14 +01001330Pass -1, and the value will be set automatically to 5
1331times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001332This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001333
Akronf73ffb62018-06-27 12:13:59 +02001334
Akron263274c2019-02-07 09:48:30 +01001335=item B<--koral|-k>
1336
1337Version of the output format. Supported versions are:
1338C<0> for legacy serialization, C<0.03> for serialization
1339with metadata fields as key-values on the root object,
1340C<0.4> for serialization with metadata fields as a list
1341of C<"@type":"koral:field"> objects.
1342
1343Currently defaults to C<0.03>.
1344
1345
Akron9ec88872017-04-12 16:29:06 +02001346=item B<--sequential-extraction|-se>
1347
Flag to indicate that archives are extracted sequentially, i.e. that the C<jobs> value does not apply to extraction.
1349Some systems may have problems with extracting multiple archives
1350to the same folder at the same time.
1351Can be flagged using C<--no-sequential-extraction> as well.
1352Defaults to C<false>.
1353
Akronf73ffb62018-06-27 12:13:59 +02001354
Akron35db6e32016-03-17 22:42:22 +01001355=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001356
Akron35db6e32016-03-17 22:42:22 +01001357Define the metadata parser to use. Defaults to C<I5>.
1358Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1359This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001360
Akronf73ffb62018-06-27 12:13:59 +02001361
Akron941c1a62016-02-23 17:41:41 +01001362=item B<--pretty|-y>
1363
Akronc13a1702016-03-15 19:33:14 +01001364Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001365This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001366
Akronf73ffb62018-06-27 12:13:59 +02001367
Akron941c1a62016-02-23 17:41:41 +01001368=item B<--gzip|-z>
1369
Akronf7ad89e2016-03-16 18:22:47 +01001370Compress the output.
1371Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001372
Akronf73ffb62018-06-27 12:13:59 +02001373
Akron11c80302016-03-18 19:44:43 +01001374=item B<--cache|-c>
1375
1376File to mmap a cache (using L<Cache::FastMmap>).
1377Defaults to C<korapxml2krill.cache> in the calling directory.
1378
Akronf73ffb62018-06-27 12:13:59 +02001379
Akron11c80302016-03-18 19:44:43 +01001380=item B<--cache-size|-cs>
1381
1382Size of the cache. Defaults to C<50m>.
1383
Akronf73ffb62018-06-27 12:13:59 +02001384
Akron11c80302016-03-18 19:44:43 +01001385=item B<--cache-init|-ci>
1386
1387Initialize cache file.
1388Can be flagged using C<--no-cache-init> as well.
1389Defaults to C<true>.
1390
Akronf73ffb62018-06-27 12:13:59 +02001391
Akron11c80302016-03-18 19:44:43 +01001392=item B<--cache-delete|-cd>
1393
1394Delete cache file after processing.
1395Can be flagged using C<--no-cache-delete> as well.
1396Defaults to C<true>.
1397
Akronf73ffb62018-06-27 12:13:59 +02001398
Akron636aa112017-04-07 18:48:56 +02001399=item B<--config|-cfg>
1400
1401Configure the parameters of your call in a file
1402of key-value pairs with whitespace separator
1403
1404 overwrite 1
1405 token DeReKo#Structure
1406 ...
1407
1408Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001409C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001410C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001411C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001413C<base-sentences>, C<base-paragraphs>,
1414C<base-pagebreaks>,
1415C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001416(semicolon separated), C<anno> (semicolon separated).
1417
Akronf73ffb62018-06-27 12:13:59 +02001418Configuration parameters will always be overwritten by
1419passed parameters.
1420
1421
Akron81500102017-04-07 20:45:44 +02001422=item B<--temporary-extract|-te>
1423
1424Only valid for the C<archive> command.
1425
1426This will first extract all files into a
1427directory and then will archive.
1428If the directory is given as C<:temp:>,
1429a temporary directory is used.
1430This is especially useful to avoid
1431massive unzipping and potential
1432network latency.
Akron636aa112017-04-07 18:48:56 +02001433
Akronf73ffb62018-06-27 12:13:59 +02001434
Akronc93a0802019-07-11 15:48:34 +02001435=item B<--to-tar>
1436
1437Only valid for the C<archive> command.
1438
1439Writes the output into a tar archive.
1440
1441
Akrone10ad322016-02-27 10:54:26 +01001442=item B<--sigle|-sg>
1443
Akron20807582016-10-26 17:11:34 +02001444Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001445Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001446I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001447Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001448In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001449On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001450
Akronf73ffb62018-06-27 12:13:59 +02001451
Akron941c1a62016-02-23 17:41:41 +01001452=item B<--log|-l>
1453
1454The L<Log4perl> log level, defaults to C<ERROR>.
1455
Akronf73ffb62018-06-27 12:13:59 +02001456
Akron941c1a62016-02-23 17:41:41 +01001457=item B<--help|-h>
1458
Akron42f48c12020-02-14 13:08:13 +01001459Print help information.
Akron941c1a62016-02-23 17:41:41 +01001460
Akronf73ffb62018-06-27 12:13:59 +02001461
Akron941c1a62016-02-23 17:41:41 +01001462=item B<--version|-v>
1463
1464Print version information.
1465
1466=back
1467
Akronf73ffb62018-06-27 12:13:59 +02001468
Akronc13a1702016-03-15 19:33:14 +01001469=head1 ANNOTATION SUPPORT
1470
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1475
Akron821db3d2017-04-06 21:19:31 +02001476 Base
1477 #Paragraphs
1478 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001479
Akron821db3d2017-04-06 21:19:31 +02001480 Connexor
1481 #Morpho
1482 #Phrase
1483 #Sentences
1484 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001485
Akron821db3d2017-04-06 21:19:31 +02001486 CoreNLP
1487 #Constituency
1488 #Morpho
1489 #NamedEntities
1490 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001491
Akronce125b62017-06-19 11:54:36 +02001492 CMC
1493 #Morpho
1494
Akron821db3d2017-04-06 21:19:31 +02001495 DeReKo
1496 #Structure
Akronc13a1702016-03-15 19:33:14 +01001497
Akron57510c12019-01-04 14:58:53 +01001498 DGD
1499 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001500 #Structure
Akron57510c12019-01-04 14:58:53 +01001501
Akron821db3d2017-04-06 21:19:31 +02001502 DRuKoLa
1503 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001504
Akron821db3d2017-04-06 21:19:31 +02001505 Glemm
1506 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001507
Akronea1aed52018-07-19 14:43:34 +02001508 HNC
1509 #Morpho
1510
Akron4c679192018-01-16 17:41:49 +01001511 LWC
1512 #Dependency
1513
Akron821db3d2017-04-06 21:19:31 +02001514 Malt
1515 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001516
Akron821db3d2017-04-06 21:19:31 +02001517 MarMoT
1518 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001519
Akron821db3d2017-04-06 21:19:31 +02001520 Mate
1521 #Dependency
1522 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001523
Akron821db3d2017-04-06 21:19:31 +02001524 MDParser
1525 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001526
Akron821db3d2017-04-06 21:19:31 +02001527 OpenNLP
1528 #Morpho
1529 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001530
Akron07e24772020-04-23 14:00:54 +02001531 RWK
1532 #Morpho
1533 #Structure
1534
Akron821db3d2017-04-06 21:19:31 +02001535 Sgbr
1536 #Lemma
1537 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001538
Akron7d5e6382019-08-08 16:36:27 +02001539 Talismane
1540 #Dependency
1541 #Morpho
1542
Akron821db3d2017-04-06 21:19:31 +02001543 TreeTagger
1544 #Morpho
1545 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001546
Akron821db3d2017-04-06 21:19:31 +02001547 XIP
1548 #Constituency
1549 #Morpho
1550 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001551
Akronc13a1702016-03-15 19:33:14 +01001552
1553More importers are in preparation.
1554New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1555See the built-in annotation importers as examples.
1556
Akronf73ffb62018-06-27 12:13:59 +02001557
Akron8f69d632020-01-15 16:58:11 +01001558=head1 About KorAP-XML
1559
1560KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1561data model (Bański et al. 2013), where text data are stored physically
1562separated from their interpretations (i.e. annotations).
1563A text document in KorAP-XML therefore consists of several files
1564containing primary data, metadata and annotations.
1565
1566The structure of a single KorAP-XML document can be as follows:
1567
1568 - data.xml
1569 - header.xml
1570 + base
1571 - tokens.xml
1572 - ...
1573 + struct
1574 - structure.xml
1575 - ...
1576 + corenlp
1577 - morpho.xml
1578 - constituency.xml
1579 - ...
1580 + tree_tagger
1581 - morpho.xml
1582 - ...
1583 - ...
1584
1585The C<data.xml> contains the primary data, the C<header.xml> contains
1586the metadata, and the annotation layers are stored in subfolders
1587like C<base>, C<struct> or C<corenlp>
1588(so-called "foundries"; Bański et al. 2013).
1589
1590Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001591(Lüngen and Sperberg-McQueen 2012). See the documentation in
1592L<KorAP::XML::Meta::I5> for translatable fields.
1593
1594Annotations correspond to a variant of the TEI-P5 feature structures
1595(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001596Annotation feature structures refer to character sequences of the primary text
1597inside the C<text> element of the C<data.xml>.
1598A single annotation containing the lemma of a token can have the following structure:
1599
1600 <span from="0" to="3">
1601 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1602 <f name="lex">
1603 <fs>
1604 <f name="lemma">zum</f>
1605 </fs>
1606 </f>
1607 </fs>
1608 </span>
1609
The C<from> and C<to> attributes refer to the character span
1611in the primary text.
1612Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1613the structure may vary. See L<KorAP::XML::Annotation::*> for various
1614annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001615
1616Multiple KorAP-XML documents are organized on three levels following
1617the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1618corpus E<gt> document E<gt> text. On each level metadata information
1619can be stored, that C<korapxml2krill> will merge to a single metadata
1620object per text. A corpus is therefore structured as follows:
1621
1622 + <corpus>
1623 - header.xml
1624 + <document>
1625 - header.xml
1626 + <text>
1627 - data.xml
1628 - header.xml
1629 - ...
1630 - ...
1631
1632A single text can be identified by the concatenation of
1633the corpus identifier, the document identifier and the text identifier.
1634This identifier is called the text sigle
1635(e.g. a text with the identifier C<18486> in the document C<060> in the
1636corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1637
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split in multiple zip archives
1640(e.g. one zip file per foundry), which is also supported (see C<--input>).
1641
1642Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1643in form of a test suite.
1644The resulting JSON format merges all annotation layers
1645based on a single token stream.
1646
1647=head2 References
1648
1649Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1650KorAP data model: first approximation, December.
1651
1652Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1653"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1654Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1655L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1656
1657Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1658"Robust corpus architecture: a new look at virtual collections and data access",
1659Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1660L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1661
1662Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1663Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1664"Towards an international standard on feature structure representation",
1665Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1666pp. 373-376.
1667L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1668
1669Harald Lüngen and C. M. Sperberg-McQueen (2012):
1670"A TEI P5 Document Grammar for the IDS Text Model",
1671Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1672L<PDF|https://journals.openedition.org/jtei/pdf/508>
1673
1674TEI Consortium, eds:
1675"Feature Structures",
1676Guidelines for Electronic Text Encoding and Interchange.
1677L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1678
Akron941c1a62016-02-23 17:41:41 +01001679=head1 AVAILABILITY
1680
1681 https://github.com/KorAP/KorAP-XML-Krill
1682
1683
1684=head1 COPYRIGHT AND LICENSE
1685
Akron8f69d632020-01-15 16:58:11 +01001686Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001687
Akron8f69d632020-01-15 16:58:11 +01001688Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001689
Akrona76d8352016-10-27 16:27:32 +02001690Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001691
1692L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1693Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001694L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001695member of the
Akronf1849aa2019-12-16 23:35:33 +01001696L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001697
1698This program is free software published under the
1699L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1700
1701=cut