blob: 0138423d8d8e26a67f7953a8798ac387cc00146c [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akronc4ec0932020-08-06 09:19:22 +020015use KorAP::XML::Krill qw!get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
Akron07e24772020-04-23 14:00:54 +0200157our $LAST_CHANGE = '2020/04/23';
Akron941c1a62016-02-23 17:41:41 +0100158our $LOCAL = $FindBin::Bin;
Akron263274c2019-02-07 09:48:30 +0100159our $KORAL_VERSION = 0.03;
Akron941c1a62016-02-23 17:41:41 +0100160our $VERSION_MSG = <<"VERSION";
161Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
162VERSION
163
Akron63f20d42017-04-10 23:40:29 +0200164# Prototypes
Akron63f20d42017-04-10 23:40:29 +0200165sub get_file_name($);
166
# Parse command:
# A leading argument that does not start with a dash
# is taken as the command and removed from the arguments.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Remember the remaining (unparsed) arguments
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100174
Akron5f51d422016-08-16 16:26:43 +0200175my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100176my $text;
Akrone10ad322016-02-27 10:54:26 +0100177
Akron941c1a62016-02-23 17:41:41 +0100178# Parse options from the command line
Nils Diewald7364d1f2013-11-05 19:26:35 +0000179GetOptions(
Akron08385f62016-03-22 20:37:04 +0100180 'input|i=s' => \@input,
Akron63f20d42017-04-10 23:40:29 +0200181 'input-base|ib=s' => \(my $input_base),
Akron941c1a62016-02-23 17:41:41 +0100182 'output|o=s' => \(my $output),
183 'overwrite|w' => \(my $overwrite),
Akron35db6e32016-03-17 22:42:22 +0100184 'meta|m=s' => \(my $meta),
Akron636aa112017-04-07 18:48:56 +0200185 'token|t=s' => \(my $token_base),
186 'base-sentences|bs=s' => \(my $base_sentences),
187 'base-paragraphs|bp=s' => \(my $base_paragraphs),
188 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
Akron941c1a62016-02-23 17:41:41 +0100189 'gzip|z' => \(my $gzip),
Akron81500102017-04-07 20:45:44 +0200190 'temporary-extract|te=s' => \(my $extract_dir),
Akrone10ad322016-02-27 10:54:26 +0100191 'skip|s=s' => \@skip,
192 'sigle|sg=s' => \@sigle,
Akron636aa112017-04-07 18:48:56 +0200193 'cache|c=s' => \(my $cache_file),
194 'config|cfg=s' => \(my $cfg_file),
195 'log|l=s' => \(my $log_level),
Akron5f51d422016-08-16 16:26:43 +0200196 'anno|a=s' => \@anno,
Akron941c1a62016-02-23 17:41:41 +0100197 'primary|p!' => \(my $primary),
198 'pretty|y' => \(my $pretty),
Akron636aa112017-04-07 18:48:56 +0200199 'jobs|j=i' => \(my $jobs),
Akron263274c2019-02-07 09:48:30 +0100200 'koral|k=f' => \(my $koral),
Akron486f9ab2017-04-22 23:25:19 +0200201 'to-tar' => \(my $to_tar),
Akroned9baf02019-01-22 17:03:25 +0100202 'non-word-tokens|nwt' => \(my $non_word_tokens),
Akronf1849aa2019-12-16 23:35:33 +0100203 'non-verbal-tokens|nvt' => \(my $non_verbal_tokens),
Akron9ec88872017-04-12 16:29:06 +0200204 'sequential-extraction|se' => \(my $sequential_extraction),
Akron636aa112017-04-07 18:48:56 +0200205 'cache-size|cs=s' => \(my $cache_size),
206 'cache-delete|cd!' => \(my $cache_delete),
207 'cache-init|ci!' => \(my $cache_init),
Akron941c1a62016-02-23 17:41:41 +0100208 'help|h' => sub {
209 pod2usage(
210 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200211 -verbose => 99,
212 -msg => $VERSION_MSG,
213 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100214 );
215 },
216 'version|v' => sub {
217 pod2usage(
Akron7d4cdd82016-08-17 21:39:45 +0200218 -verbose => 0,
219 -msg => $VERSION_MSG,
220 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100221 )
222 }
Nils Diewald7364d1f2013-11-05 19:26:35 +0000223);
224
Akron63f20d42017-04-10 23:40:29 +0200225
# Load from configuration
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options (configuration key => target variable).
  # A configuration value is only taken in case the option
  # was not already defined on the command line.
  my @scalar_opts = (
    'overwrite'             => \$overwrite,
    'gzip'                  => \$gzip,
    'jobs'                  => \$jobs,
    'koral'                 => \$koral,
    'input-base'            => \$input_base,
    'temporary-extract'     => \$extract_dir,
    'token'                 => \$token_base,
    'non-word-tokens'       => \$non_word_tokens,
    'non-verbal-tokens'     => \$non_verbal_tokens,
    'cache'                 => \$cache_file,
    'cache-size'            => \$cache_size,
    'cache-delete'          => \$cache_delete,
    'cache-init'            => \$cache_init,
    'sequential-extraction' => \$sequential_extraction,
    'meta'                  => \$meta,
    'output'                => \$output,
    'base-sentences'        => \$base_sentences,
    'base-paragraphs'       => \$base_paragraphs,
    'base-pagebreaks'       => \$base_pagebreaks,
    'to-tar'                => \$to_tar,
    'log'                   => \$log_level
  );

  while (my ($key, $target) = splice(@scalar_opts, 0, 2)) {
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # Multi value options (configuration key => target list).
  # Values are separated by semicolons and only taken in case
  # nothing was passed on the command line.
  my @list_opts = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno
  );

  while (my ($key, $target) = splice(@list_opts, 0, 2)) {
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
352
Akron63f20d42017-04-10 23:40:29 +0200353
# Set defaults for all options that are still undefined
# (neither passed on the command line nor set in the configuration)
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$koral                 = $KORAL_VERSION         unless defined $koral;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;
$non_word_tokens       = 0                      unless defined $non_word_tokens;
$non_verbal_tokens     = 0                      unless defined $non_verbal_tokens;

# Base annotations default to the empty string
# and are normalized to lower case
foreach ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $_ = lc($_ // '');
};
Akron3741f8b2016-12-21 19:55:21 +0100373
Akron63f20d42017-04-10 23:40:29 +0200374
375# Initialize log4perl object
376Log::Log4perl->init({
377 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
378 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
379 'log4perl.appender.STDERR.layout' => 'PatternLayout',
380 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
381});
382
383my $log = Log::Log4perl->get_logger('main');
384
385
386print "Reading config from $cfg_file\n" if $cfg_file;
387
388
Akron941c1a62016-02-23 17:41:41 +0100389my %ERROR_HASH = (
390 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200391 -verbose => 99,
392 -msg => $VERSION_MSG,
393 -output => '-',
394 -exit => 1
Akron941c1a62016-02-23 17:41:41 +0100395);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000396
Akron941c1a62016-02-23 17:41:41 +0100397# Input has to be defined
Akron08385f62016-03-22 20:37:04 +0100398pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000399
Akrone1dbc382016-07-08 22:24:52 +0200400# Gzip has no effect, if no output is given
401pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000402
Akronc11f7982017-02-21 21:20:14 +0100403
Akron636aa112017-04-07 18:48:56 +0200404if ($jobs eq '-1') {
Akronc11f7982017-02-21 21:20:14 +0100405 state $cores = Sys::Info->new->device('CPU')->count;
406 $jobs = ceil(5 * $cores);
Akron636aa112017-04-07 18:48:56 +0200407 $log->info("Run using $jobs jobs on $cores cores");
Akronc11f7982017-02-21 21:20:14 +0100408};
409
Akron821db3d2017-04-06 21:19:31 +0200410
# Start serial processing:
# Every input is processed on its own by calling this script again
# with the 'archive' command. The output of each call is located
# below the given output and named after the input.
if ($cmd && $cmd eq 'serial') {

  # Output is required as the base of all archive outputs.
  # Without it, catdir() below would resolve to a path
  # in the root of the file system.
  pod2usage(%ERROR_HASH) unless $output;

  if ((!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output parameters from the arguments
  # to pass on, as they are set per archive call below.
  # Both notations "--input value" and "--input=value" are respected.
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag
    if (/\A--?(?:input|i|output|o)(=.*)?\z/s) {

      # Without an attached value, the value is the next argument
      $remove_next = defined $1 ? 0 : 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Number of inputs that could not be archived
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving (list form - no shell is involved)
    # and remember failures instead of ignoring them silently
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Processing of $_ failed (" .
          ($? == -1 ? $! : "wait status $?") . ')'
        );
      $failed++;
    };
  };

  # Signal failure if at least one archive could not be processed
  exit($failed ? 1 : 0);
};
465
# Lookup table of everything to skip (keys are lower cased)
my %skip = map { lc($_) => 1 } @skip;

# List of all supported annotation layers as [Foundry, Layer]
# (optionally followed by a parameter). The order is relevant.
my @layers;

# Base annotations are only added if not taken from another foundry
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo
# Collect all base annotations that are taken from the DeReKo structure
my @dereko_attr = map { $_->[0] }
  grep { $_->[1] eq 'dereko#structure' } (
    ['sentences',  $base_sentences],
    ['paragraphs', $base_paragraphs],
    ['pagebreaks', $base_pagebreaks]
  );

push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
566
Akron4fa37c32017-01-20 14:43:10 +0100567
# Check filters
my @filtered_anno;

# Everything is skipped by default:
# Only the explicitly requested annotations (Foundry#Layer) are added
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Keep unless the Foundry or the Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
586
# Get tokenization basis
# (Declaration and conditional assignment are separated, as the
# behaviour of "my ... if ..." is undefined in Perl)
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension
# (The layer is undefined in case the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200592
593# TODO: This should not be initialized for batch
594my $cache = Cache::FastMmap->new(
595 share_file => $cache_file,
596 cache_size => $cache_size,
Akron03b24db2016-08-16 20:54:32 +0200597 init_file => $cache_init
Akrone1dbc382016-07-08 22:24:52 +0200598);
599
Akron03b24db2016-08-16 20:54:32 +0200600# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200601my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200602 cache => $cache,
Akrone1dbc382016-07-08 22:24:52 +0200603 meta_type => $meta,
604 overwrite => $overwrite,
Akron03b24db2016-08-16 20:54:32 +0200605 foundry => $token_base_foundry,
606 layer => $token_base_layer,
607 gzip => $gzip,
608 log => $log,
Akron263274c2019-02-07 09:48:30 +0100609 koral => $koral,
Akron03b24db2016-08-16 20:54:32 +0200610 primary => $primary,
611 pretty => $pretty,
Akroned9baf02019-01-22 17:03:25 +0100612 anno => \@filtered_anno,
Akronf1849aa2019-12-16 23:35:33 +0100613 non_word_tokens => $non_word_tokens,
614 non_verbal_tokens => $non_verbal_tokens
Akrone1dbc382016-07-08 22:24:52 +0200615);
616
# Get file name based on path information
#
# Expects the path of a file and returns a flat name:
# Temporary directory fragments and the path of the first input
# are removed, slashes are replaced by dashes and - in case there
# are more than three dash separated parts - the shortest leading
# part is dropped so that at least three parts remain.
sub get_file_name ($) {
  my $file = shift;

  # The first input is the base to remove from the path.
  # For directories without a trailing slash,
  # the last path segment is stripped.
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input base.
  # The base is quoted, so characters like '.' or '+' in the path
  # are matched literally instead of being treated as a pattern.
  # (The match is deliberately not anchored, as it was before.)
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the shortest leading part, keeping at least three parts
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
633
Akron63f20d42017-04-10 23:40:29 +0200634
Akrone10ad322016-02-27 10:54:26 +0100635# Convert sigle to path construct
636s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
637
Akron7d4cdd82016-08-17 21:39:45 +0200638if ($cmd) {
Akron486f9ab2017-04-22 23:25:19 +0200639 if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
Akron3abc03e2017-06-29 16:23:35 +0200640 $log->error("Directory '$output' does not exist.");
641 exit 1;
Akron7d4cdd82016-08-17 21:39:45 +0200642 };
643};
644
Akron63f20d42017-04-10 23:40:29 +0200645
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand all wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n";
};
665
666
Akron941c1a62016-02-23 17:41:41 +0100667# Process a single file
668unless ($cmd) {
Akron08385f62016-03-22 20:37:04 +0100669 my $input = $input[0];
Nils Diewald59094f22014-11-05 18:20:50 +0000670
Akron941c1a62016-02-23 17:41:41 +0100671 BEGIN {
672 $main::TIME = Benchmark->new;
673 $main::LAST_STOP = Benchmark->new;
674 };
675
676 sub stop_time {
677 my $new = Benchmark->new;
Akron5f51d422016-08-16 16:26:43 +0200678 $log->info(
Akron941c1a62016-02-23 17:41:41 +0100679 'The code took: '.
Akron5f51d422016-08-16 16:26:43 +0200680 timestr(timediff($new, $main::LAST_STOP)) .
681 ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
682 );
Akron941c1a62016-02-23 17:41:41 +0100683 $main::LAST_STOP = $new;
684 };
685
686 # Create and parse new document
687 $input =~ s{([^/])$}{$1/};
Akron941c1a62016-02-23 17:41:41 +0100688
Akron7d4cdd82016-08-17 21:39:45 +0200689 # Process file
Akrone1dbc382016-07-08 22:24:52 +0200690 $batch_file->process($input, $output);
Akron941c1a62016-02-23 17:41:41 +0100691
Akron11c80302016-03-18 19:44:43 +0100692 # Delete cache file
693 unlink($cache_file) if $cache_delete;
694
Akron5f51d422016-08-16 16:26:43 +0200695 stop_time;
Akron3abc03e2017-06-29 16:23:35 +0200696 exit;
Akron81500102017-04-07 20:45:44 +0200697};
698
Nils Diewald59094f22014-11-05 18:20:50 +0000699
Akrone10ad322016-02-27 10:54:26 +0100700# Extract XML files
Akron81500102017-04-07 20:45:44 +0200701if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100702
Akrond5643ad2017-07-04 20:27:13 +0200703 # Output is required
704 pod2usage(%ERROR_HASH) unless $output;
705
Akron7d4cdd82016-08-17 21:39:45 +0200706 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200707 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100708
Akron7d4cdd82016-08-17 21:39:45 +0200709 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100710 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200711 $log->error("Unzip is not installed or incompatible.");
712 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100713 };
714
Akronb0c88db2016-06-29 16:33:18 +0200715 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200716 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200717
Akron31a08cb2019-02-20 20:43:26 +0100718 # Will set @sigle
719 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200720
Akrone10ad322016-02-27 10:54:26 +0100721 # Iterate over all given sigles and extract
722 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100723
Akron2812ba22016-10-28 21:55:59 +0200724 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200725
Akron03b24db2016-08-16 20:54:32 +0200726 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200727 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100728
Akron955b75b2019-02-21 14:28:41 +0100729 # TODO:
730 # - prefix???
731 $archive->extract_sigle([$_], $output, $jobs)
732 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200733 );
Akrone10ad322016-02-27 10:54:26 +0100734 print "extracted.\n";
735 };
Akronb0c88db2016-06-29 16:33:18 +0200736 }
Akron7d4cdd82016-08-17 21:39:45 +0200737
738 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200739 else {
740 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200741 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100742 };
743}
744
Akron81500102017-04-07 20:45:44 +0200745
Akron941c1a62016-02-23 17:41:41 +0100746# Process an archive
747elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000748
Akron81500102017-04-07 20:45:44 +0200749 my $archive_output;
750
751 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100752 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200753
754 # Create new archive object
755 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
756
757 # Check zip capabilities
758 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200759 $log->error("Unzip is not installed or incompatible.");
760 exit 1;
Akron81500102017-04-07 20:45:44 +0200761 };
762
763 # Add further annotation archived
764 $archive->attach($_) foreach @input[1..$#input];
765
766 # Create a temporary directory
767 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200768 $extract_dir = tempdir(CLEANUP => 0);
769 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200770 };
771
Akron63f20d42017-04-10 23:40:29 +0200772 # Add some random extra to avoid clashes with multiple archives
773 $extract_dir = catdir($extract_dir, random_string('cccccc'));
774
Akron31a08cb2019-02-20 20:43:26 +0100775 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200776 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200777 @input = ($extract_dir);
778 }
779 else {
780 $log->error('Unable to extract from primary archive ' . $input[0] .
781 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200782 exit 1;
Akron81500102017-04-07 20:45:44 +0200783 };
784 }
785
786 # Can't create archive object
787 else {
788 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200789 exit 1;
Akron81500102017-04-07 20:45:44 +0200790 };
791 };
792
Akron7d4cdd82016-08-17 21:39:45 +0200793 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100794 my $pool = Parallel::ForkManager->new($jobs);
795
Akron7d4cdd82016-08-17 21:39:45 +0200796 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100797 my $iter = 1; # Current text in process
798
Akronda3097e2017-04-23 19:53:57 +0200799 my $tar_archive;
800 my $output_dir = $output;
801 my $tar_fh;
802
803 # Initialize tar archive
804 if ($to_tar) {
805 $tar_archive = Archive::Tar::Builder->new(
806 ignore_errors => 1
807 );
808
809 # Set output name
810 my $tar_file = $output;
811 unless ($tar_file =~ /\.tar$/) {
812 $tar_file .= '.tar';
813 };
814
815 # Initiate the tar file
816 print "Writing to file $tar_file\n";
817 $tar_fh = IO::File->new($tar_file, 'w');
818 $tar_fh->binmode(1);
819
820 # Set handle
821 $tar_archive->set_handle($tar_fh);
822
823 # Output to temporary directory
824 $output_dir = File::Temp->newdir;
825 };
826
Akron941c1a62016-02-23 17:41:41 +0100827 # Report on fork message
828 $pool->run_on_finish (
829 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200830 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100831 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200832
Akron08385f62016-03-22 20:37:04 +0100833 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200834 ($iter++) . "/$count]" .
835 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200836 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200837
838 if (!$code && $to_tar && $data->[2]) {
839 my $filename = $data->[2];
840
841 # Lock filehandle
842 if (flock($tar_fh, LOCK_EX)) {
843
Akron9a062ce2017-07-04 19:12:05 +0200844 my $clean_file = fileparse($filename);
845
Akronda3097e2017-04-23 19:53:57 +0200846 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200847 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200848 unlink $filename;
849
850 # Unlock filehandle
851 flock($tar_fh, LOCK_UN);
852 }
853 else {
854 $log->warn("Unable to add $filename to archive");
855 };
856 };
857
Akron4c0cf312016-10-15 16:42:09 +0200858 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100859 }
860 );
861
862 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200863 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100864 print "Reading data ...\n";
865
Akron7d4cdd82016-08-17 21:39:45 +0200866 # unless (Cache::FastMmap->new(
867 # share_file => $cache_file,
868 # cache_size => $cache_size,
869 # init_file => $cache_init
870 # )) {
871 # print "Unable to intialize cache '$cache_file'\n\n";
872 # exit(1);
873 # };
Akron11c80302016-03-18 19:44:43 +0100874
Akron486f9ab2017-04-22 23:25:19 +0200875
Akron941c1a62016-02-23 17:41:41 +0100876 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100877 if (-d $input[0]) {
878 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100879 my @dirs;
880 my $dir;
881
Akron7d4cdd82016-08-17 21:39:45 +0200882 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100883 while (1) {
884 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200885 push @dirs, $dir;
886 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100887 };
888 last unless $it->next;
889 };
890
891 print "Start processing ...\n";
892 $t = Benchmark->new;
893 $count = scalar @dirs;
894
895 DIRECTORY_LOOP:
896 for (my $i = 0; $i < $count; $i++) {
897
Akrone1dbc382016-07-08 22:24:52 +0200898 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200899 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200900 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200901 );
Akron941c1a62016-02-23 17:41:41 +0100902
903 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200904 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200905
Akron13d56622016-10-31 14:54:49 +0100906 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200907 $pool->finish(
908 0,
Akronda3097e2017-04-23 19:53:57 +0200909 [
910 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
911 undef,
912 $filename
913 ]
Akron486f9ab2017-04-22 23:25:19 +0200914 );
Akron3ec48972016-08-17 23:24:52 +0200915 }
916 else {
Akron4c0cf312016-10-15 16:42:09 +0200917 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200918 };
Akron941c1a62016-02-23 17:41:41 +0100919 };
920 }
921
922 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200923 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200924
Akron941c1a62016-02-23 17:41:41 +0100925 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200926 $log->error("Unzip is not installed or incompatible.");
927 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100928 };
929
Akron08385f62016-03-22 20:37:04 +0100930 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200931 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100932
Akron31a08cb2019-02-20 20:43:26 +0100933 # Get sigles to extract
934 my $prefix = set_sigle($archive);
935
Akron941c1a62016-02-23 17:41:41 +0100936 print "Start processing ...\n";
937 $t = Benchmark->new;
938 my @dirs = $archive->list_texts;
939 $count = scalar @dirs;
940
941 ARCHIVE_LOOP:
942 for (my $i = 0; $i < $count; $i++) {
943
944 # Split path information
945 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
946
Akrone1dbc382016-07-08 22:24:52 +0200947 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200948 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200949 get_file_name(
950 catfile($corpus, $doc, $text)
951 . '.json' . ($gzip ? '.gz' : '')
952 )
Akrone1dbc382016-07-08 22:24:52 +0200953 );
Akron941c1a62016-02-23 17:41:41 +0100954
955 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200956 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100957
Akron4c0cf312016-10-15 16:42:09 +0200958 # Create temporary file
959 $temp = File::Temp->newdir;
960
Akronbdf434a2016-10-24 17:42:07 +0200961 # TODO: Check if $filename exist at the beginning,
962 # because extraction can be horrible slow!
963
Akron941c1a62016-02-23 17:41:41 +0100964 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100965 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100966
Akron7d4cdd82016-08-17 21:39:45 +0200967 # Create corpus directory
968 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100969
Akron7d4cdd82016-08-17 21:39:45 +0200970 # Temporary directory
971 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100972
Akron7d4cdd82016-08-17 21:39:45 +0200973 # Write file
Akron13d56622016-10-31 14:54:49 +0100974 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200975
Akron4c0cf312016-10-15 16:42:09 +0200976 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100977 $pool->finish(
978 0,
Akronda3097e2017-04-23 19:53:57 +0200979 [
980 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
981 $temp,
982 $filename
983 ]
Akron13d56622016-10-31 14:54:49 +0100984 );
985 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200986 }
987 else {
Akron4c0cf312016-10-15 16:42:09 +0200988 # Delete temporary file
989 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200990 };
Akron941c1a62016-02-23 17:41:41 +0100991 }
Akron7d4cdd82016-08-17 21:39:45 +0200992
993 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100994 else {
Akron4c0cf312016-10-15 16:42:09 +0200995 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100996 };
997 };
998 }
999
1000 else {
1001 print "Input is neither a directory nor an archive.\n\n";
1002 };
1003
1004 $pool->wait_all_children;
1005
Akron11c80302016-03-18 19:44:43 +01001006 # Delete cache file
1007 unlink($cache_file) if $cache_delete;
1008
Akronda3097e2017-04-23 19:53:57 +02001009 # Close tar filehandle
1010 if ($to_tar && $tar_fh) {
1011 $tar_archive->finish;
1012 $tar_fh->close;
1013 print "Wrote to tar archive.\n";
1014 };
1015
Akron63f20d42017-04-10 23:40:29 +02001016 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001017 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001018};
Akron941c1a62016-02-23 17:41:41 +01001019
Nils Diewald2db9ad02013-10-29 19:26:43 +00001020
# Prepare the list of sigles to process for an archive.
#
# Reads and rewrites the file-level @sigle list:
#   - If @sigle is empty, it is filled with one text sigle
#     (corpus/doc/text) for every text listed in the archive.
#   - Otherwise every document sigle (corpus/doc) is extracted to
#     $output immediately and dropped from @sigle, so that only
#     text sigles remain in the list.
#
# Parameters:
#   $archive - archive object; the methods list_texts, split_path,
#              check_prefix and extract_sigle are called on it
#
# Returns the prefix value: the first element of the last split_path
# result (no sigles given), the result of check_prefix (sigles given),
# or 1 if neither was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - extract doc sigles, keep text sigles
  my @text_sigles;
  my $prefix_checked = 0;

  # The prefix is looked up only once, when the first sigle is handled
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Iterate over all sigle
  foreach my $current (@sigle) {

    # Sigle is a doc sigle
    if ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$current ...";
      $check_prefix_once->();
      print "\n";

      print '... ' . (
        $archive->extract_sigle([$current], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
        );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigles, $current;
      $check_prefix_once->();
    };
  };

  # Only text sigles are left for further processing
  @sigle = @text_sigles;

  return $prefix;
};
1094
1095
1096
# Processing is done: delete the extraction directory (if one was used)
# and report how many objects remove_tree returned
if ($extract_dir) {
  my $objects = remove_tree(
    $extract_dir,
    { safe => 1 }
  );
  print "Removed directory $extract_dir with $objects objects.\n";
};

# Finish the output with an empty line
print "\n";
1105
Nils Diewald2db9ad02013-10-29 19:26:43 +00001106__END__
Akron941c1a62016-02-23 17:41:41 +01001107
1108=pod
1109
1110=encoding utf8
1111
1112=head1 NAME
1113
Akron42f48c12020-02-14 13:08:13 +01001114korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001115
1116
1117=head1 SYNOPSIS
1118
Akrona76d8352016-10-27 16:27:32 +02001119 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001120
Akron2fd402b2016-10-27 21:26:48 +02001121
Akron941c1a62016-02-23 17:41:41 +01001122=head1 DESCRIPTION
1123
1124L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1125compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001126The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001127
1128
1129=head1 INSTALLATION
1130
1131The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1132
Akronaf386982016-10-12 00:33:25 +02001133 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001134
Akronc13a1702016-03-15 19:33:14 +01001135In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001136be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001137Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001139
1140=head1 ARGUMENTS
1141
Akrona76d8352016-10-27 16:27:32 +02001142 $ korapxml2krill -z --input <directory> --output <filename>
1143
1144Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001145It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001146
Akron941c1a62016-02-23 17:41:41 +01001147=over 2
1148
1149=item B<archive>
1150
Akron081639e2017-04-21 19:01:39 +02001151 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001152
Akron2fd402b2016-10-27 21:26:48 +02001153Converts an archive of KorAP-XML documents. It expects a directory
1154(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001155
1156=item B<extract>
1157
Akrona76d8352016-10-27 16:27:32 +02001158 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1159
1160Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001161
Akron63f20d42017-04-10 23:40:29 +02001162=item B<serial>
1163
1164 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1165
1166Convert archives sequentially. The inputs are not merged but treated
1167as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001169are created based on the archive name. In case the C<--to-tar> flag is given,
1170the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001171
1172
Akron941c1a62016-02-23 17:41:41 +01001173=back
1174
1175
1176=head1 OPTIONS
1177
1178=over 2
1179
Akrona76d8352016-10-27 16:27:32 +02001180=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001181
Akrona76d8352016-10-27 16:27:32 +02001182Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001183
Akron7606afa2016-10-25 16:23:49 +02001184Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001185document, while C<archive> expects a KorAP-XML corpus folder or a zip
1186file to batch process multiple files.
1187C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001188
Akrona76d8352016-10-27 16:27:32 +02001189C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001190that the first archive listed contains all primary data files
1191and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001192
Akron7606afa2016-10-25 16:23:49 +02001193 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001194
Akron821db3d2017-04-06 21:19:31 +02001195Input may also be defined using BSD glob wildcards.
1196
1197 -i 'file/news*.zip'
1198
1199The extended input array will be sorted in length order, so the shortest
1200path needs to contain all primary data files and all meta data files.
1201
Akron0c3e3752016-06-28 15:55:53 +02001202(The directory structure follows the base directory format,
1203that may include a C<.> root folder.
1204In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001205need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001207
Akron7606afa2016-10-25 16:23:49 +02001208To support zip files, a version of C<unzip> needs to be installed that is
1209compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001210
Akron7606afa2016-10-25 16:23:49 +02001211B<The root folder switch using the hash sign is experimental and
1212may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001213
Akronf73ffb62018-06-27 12:13:59 +02001214
Akron63f20d42017-04-10 23:40:29 +02001215=item B<--input-base|-ib> <directory>
1216
1217The base directory for inputs.
1218
1219
Akron941c1a62016-02-23 17:41:41 +01001220=item B<--output|-o> <directory|file>
1221
1222Output folder for archive processing or
1223document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001224writes to C<STDOUT> by default
1225(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001226
1227=item B<--overwrite|-w>
1228
1229Overwrite files that already exist.
1230
Akronf73ffb62018-06-27 12:13:59 +02001231
Akron3741f8b2016-12-21 19:55:21 +01001232=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001233
1234Define the default tokenization by specifying
1235the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001236of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001237This will directly take the file instead of running
1238the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001239
Akron3741f8b2016-12-21 19:55:21 +01001240
1241=item B<--base-sentences|-bs> <foundry>#<layer>
1242
1243Define the layer for base sentences.
1244If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001245Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1246layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001247
1248 Defaults to unset.
1249
1250
1251=item B<--base-paragraphs|-bp> <foundry>#<layer>
1252
1253Define the layer for base paragraphs.
1254If given, this will be used instead of using C<Base#Paragraphs>.
1255Currently C<DeReKo#Structure> is the only additional layer supported.
1256
1257 Defaults to unset.
1258
1259
Akron41ac10b2017-02-08 22:47:25 +01001260=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1261
1262Define the layer for base pagebreaks.
1263Currently C<DeReKo#Structure> is the only layer supported.
1264
1265 Defaults to unset.
1266
1267
Akron941c1a62016-02-23 17:41:41 +01001268=item B<--skip|-s> <foundry>[#<layer>]
1269
Akronf7ad89e2016-03-16 18:22:47 +01001270Skip specific annotations by specifying the foundry
1271(and optionally the layer with a C<#>-prefix),
1272e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001273Can be set multiple times.
1274
Akronf73ffb62018-06-27 12:13:59 +02001275
Akronc13a1702016-03-15 19:33:14 +01001276=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001277
Akronf7ad89e2016-03-16 18:22:47 +01001278Convert specific annotations by specifying the foundry
1279(and optionally the layer with a C<#>-prefix),
1280e.g. C<Mate> or C<Mate#Morpho>.
1281Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001282
Akronf73ffb62018-06-27 12:13:59 +02001283
Akron941c1a62016-02-23 17:41:41 +01001284=item B<--primary|-p>
1285
Akronc13a1702016-03-15 19:33:14 +01001286Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001287Can be flagged using C<--no-primary> as well.
1288This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001289
Akronf73ffb62018-06-27 12:13:59 +02001290
Akroned9baf02019-01-22 17:03:25 +01001291=item B<--non-word-tokens|-nwt>
1292
1293Tokenize non-word tokens like word tokens (defined as matching
1294C</[\d\w]/>). Useful to treat punctuations as tokens.
1295
1296 Defaults to unset.
1297
Akronf1849aa2019-12-16 23:35:33 +01001298
1299=item B<--non-verbal-tokens|-nvt>
1300
Tokenize non-verbal tokens marked in the primary data as
the unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1303
1304 Defaults to unset.
1305
1306
Akron941c1a62016-02-23 17:41:41 +01001307=item B<--jobs|-j>
1308
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001310for archive processing.
Akron11c80302016-03-18 19:44:43 +01001311Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001312
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1315
Akronc11f7982017-02-21 21:20:14 +01001316Pass -1, and the value will be set automatically to 5
1317times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001318This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001319
Akronf73ffb62018-06-27 12:13:59 +02001320
Akron263274c2019-02-07 09:48:30 +01001321=item B<--koral|-k>
1322
1323Version of the output format. Supported versions are:
1324C<0> for legacy serialization, C<0.03> for serialization
1325with metadata fields as key-values on the root object,
1326C<0.4> for serialization with metadata fields as a list
1327of C<"@type":"koral:field"> objects.
1328
1329Currently defaults to C<0.03>.
1330
1331
Akron9ec88872017-04-12 16:29:06 +02001332=item B<--sequential-extraction|-se>
1333
Flag to indicate that extraction should run sequentially, so the C<jobs> value does not apply to extraction.
1335Some systems may have problems with extracting multiple archives
1336to the same folder at the same time.
1337Can be flagged using C<--no-sequential-extraction> as well.
1338Defaults to C<false>.
1339
Akronf73ffb62018-06-27 12:13:59 +02001340
Akron35db6e32016-03-17 22:42:22 +01001341=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001342
Akron35db6e32016-03-17 22:42:22 +01001343Define the metadata parser to use. Defaults to C<I5>.
1344Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1345This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001346
Akronf73ffb62018-06-27 12:13:59 +02001347
Akron941c1a62016-02-23 17:41:41 +01001348=item B<--pretty|-y>
1349
Akronc13a1702016-03-15 19:33:14 +01001350Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001351This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001352
Akronf73ffb62018-06-27 12:13:59 +02001353
Akron941c1a62016-02-23 17:41:41 +01001354=item B<--gzip|-z>
1355
Akronf7ad89e2016-03-16 18:22:47 +01001356Compress the output.
1357Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001358
Akronf73ffb62018-06-27 12:13:59 +02001359
Akron11c80302016-03-18 19:44:43 +01001360=item B<--cache|-c>
1361
1362File to mmap a cache (using L<Cache::FastMmap>).
1363Defaults to C<korapxml2krill.cache> in the calling directory.
1364
Akronf73ffb62018-06-27 12:13:59 +02001365
Akron11c80302016-03-18 19:44:43 +01001366=item B<--cache-size|-cs>
1367
1368Size of the cache. Defaults to C<50m>.
1369
Akronf73ffb62018-06-27 12:13:59 +02001370
Akron11c80302016-03-18 19:44:43 +01001371=item B<--cache-init|-ci>
1372
1373Initialize cache file.
1374Can be flagged using C<--no-cache-init> as well.
1375Defaults to C<true>.
1376
Akronf73ffb62018-06-27 12:13:59 +02001377
Akron11c80302016-03-18 19:44:43 +01001378=item B<--cache-delete|-cd>
1379
1380Delete cache file after processing.
1381Can be flagged using C<--no-cache-delete> as well.
1382Defaults to C<true>.
1383
Akronf73ffb62018-06-27 12:13:59 +02001384
Akron636aa112017-04-07 18:48:56 +02001385=item B<--config|-cfg>
1386
1387Configure the parameters of your call in a file
1388of key-value pairs with whitespace separator
1389
1390 overwrite 1
1391 token DeReKo#Structure
1392 ...
1393
1394Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001395C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001396C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001397C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001399C<base-sentences>, C<base-paragraphs>,
1400C<base-pagebreaks>,
1401C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001402(semicolon separated), C<anno> (semicolon separated).
1403
Akronf73ffb62018-06-27 12:13:59 +02001404Configuration parameters will always be overwritten by
1405passed parameters.
1406
1407
Akron81500102017-04-07 20:45:44 +02001408=item B<--temporary-extract|-te>
1409
1410Only valid for the C<archive> command.
1411
1412This will first extract all files into a
1413directory and then will archive.
1414If the directory is given as C<:temp:>,
1415a temporary directory is used.
1416This is especially useful to avoid
1417massive unzipping and potential
1418network latency.
Akron636aa112017-04-07 18:48:56 +02001419
Akronf73ffb62018-06-27 12:13:59 +02001420
Akronc93a0802019-07-11 15:48:34 +02001421=item B<--to-tar>
1422
1423Only valid for the C<archive> command.
1424
1425Writes the output into a tar archive.
1426
1427
Akrone10ad322016-02-27 10:54:26 +01001428=item B<--sigle|-sg>
1429
Akron20807582016-10-26 17:11:34 +02001430Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001431Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001432I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001433Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001434In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001435On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001436
Akronf73ffb62018-06-27 12:13:59 +02001437
Akron941c1a62016-02-23 17:41:41 +01001438=item B<--log|-l>
1439
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1441
Akronf73ffb62018-06-27 12:13:59 +02001442
Akron941c1a62016-02-23 17:41:41 +01001443=item B<--help|-h>
1444
Akron42f48c12020-02-14 13:08:13 +01001445Print help information.
Akron941c1a62016-02-23 17:41:41 +01001446
Akronf73ffb62018-06-27 12:13:59 +02001447
Akron941c1a62016-02-23 17:41:41 +01001448=item B<--version|-v>
1449
1450Print version information.
1451
1452=back
1453
Akronf73ffb62018-06-27 12:13:59 +02001454
Akronc13a1702016-03-15 19:33:14 +01001455=head1 ANNOTATION SUPPORT
1456
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1458developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1459The base foundry with paragraphs, sentences, and the text element are mandatory for
1460L<Krill|https://github.com/KorAP/Krill>.
1461
Akron821db3d2017-04-06 21:19:31 +02001462 Base
1463 #Paragraphs
1464 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001465
Akron821db3d2017-04-06 21:19:31 +02001466 Connexor
1467 #Morpho
1468 #Phrase
1469 #Sentences
1470 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001471
Akron821db3d2017-04-06 21:19:31 +02001472 CoreNLP
1473 #Constituency
1474 #Morpho
1475 #NamedEntities
1476 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001477
Akronce125b62017-06-19 11:54:36 +02001478 CMC
1479 #Morpho
1480
Akron821db3d2017-04-06 21:19:31 +02001481 DeReKo
1482 #Structure
Akronc13a1702016-03-15 19:33:14 +01001483
Akron57510c12019-01-04 14:58:53 +01001484 DGD
1485 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001486 #Structure
Akron57510c12019-01-04 14:58:53 +01001487
Akron821db3d2017-04-06 21:19:31 +02001488 DRuKoLa
1489 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001490
Akron821db3d2017-04-06 21:19:31 +02001491 Glemm
1492 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001493
Akronea1aed52018-07-19 14:43:34 +02001494 HNC
1495 #Morpho
1496
Akron4c679192018-01-16 17:41:49 +01001497 LWC
1498 #Dependency
1499
Akron821db3d2017-04-06 21:19:31 +02001500 Malt
1501 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001502
Akron821db3d2017-04-06 21:19:31 +02001503 MarMoT
1504 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001505
Akron821db3d2017-04-06 21:19:31 +02001506 Mate
1507 #Dependency
1508 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001509
Akron821db3d2017-04-06 21:19:31 +02001510 MDParser
1511 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001512
Akron821db3d2017-04-06 21:19:31 +02001513 OpenNLP
1514 #Morpho
1515 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001516
Akron07e24772020-04-23 14:00:54 +02001517 RWK
1518 #Morpho
1519 #Structure
1520
Akron821db3d2017-04-06 21:19:31 +02001521 Sgbr
1522 #Lemma
1523 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001524
Akron7d5e6382019-08-08 16:36:27 +02001525 Talismane
1526 #Dependency
1527 #Morpho
1528
Akron821db3d2017-04-06 21:19:31 +02001529 TreeTagger
1530 #Morpho
1531 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001532
Akron821db3d2017-04-06 21:19:31 +02001533 XIP
1534 #Constituency
1535 #Morpho
1536 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001537
Akronc13a1702016-03-15 19:33:14 +01001538
1539More importers are in preparation.
1540New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1541See the built-in annotation importers as examples.
1542
Akronf73ffb62018-06-27 12:13:59 +02001543
Akron8f69d632020-01-15 16:58:11 +01001544=head1 About KorAP-XML
1545
1546KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1547data model (Bański et al. 2013), where text data are stored physically
1548separated from their interpretations (i.e. annotations).
1549A text document in KorAP-XML therefore consists of several files
1550containing primary data, metadata and annotations.
1551
1552The structure of a single KorAP-XML document can be as follows:
1553
1554 - data.xml
1555 - header.xml
1556 + base
1557 - tokens.xml
1558 - ...
1559 + struct
1560 - structure.xml
1561 - ...
1562 + corenlp
1563 - morpho.xml
1564 - constituency.xml
1565 - ...
1566 + tree_tagger
1567 - morpho.xml
1568 - ...
1569 - ...
1570
1571The C<data.xml> contains the primary data, the C<header.xml> contains
1572the metadata, and the annotation layers are stored in subfolders
1573like C<base>, C<struct> or C<corenlp>
1574(so-called "foundries"; Bański et al. 2013).
1575
1576Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001577(Lüngen and Sperberg-McQueen 2012). See the documentation in
1578L<KorAP::XML::Meta::I5> for translatable fields.
1579
1580Annotations correspond to a variant of the TEI-P5 feature structures
1581(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001582Annotation feature structures refer to character sequences of the primary text
1583inside the C<text> element of the C<data.xml>.
1584A single annotation containing the lemma of a token can have the following structure:
1585
1586 <span from="0" to="3">
1587 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1588 <f name="lex">
1589 <fs>
1590 <f name="lemma">zum</f>
1591 </fs>
1592 </f>
1593 </fs>
1594 </span>
1595
The C<from> and C<to> attributes refer to the character span
1597in the primary text.
1598Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1599the structure may vary. See L<KorAP::XML::Annotation::*> for various
1600annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001601
1602Multiple KorAP-XML documents are organized on three levels following
1603the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1604corpus E<gt> document E<gt> text. On each level metadata information
1605can be stored, that C<korapxml2krill> will merge to a single metadata
1606object per text. A corpus is therefore structured as follows:
1607
1608 + <corpus>
1609 - header.xml
1610 + <document>
1611 - header.xml
1612 + <text>
1613 - data.xml
1614 - header.xml
1615 - ...
1616 - ...
1617
1618A single text can be identified by the concatenation of
1619the corpus identifier, the document identifier and the text identifier.
1620This identifier is called the text sigle
1621(e.g. a text with the identifier C<18486> in the document C<060> in the
1622corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1623
These corpora are often stored in zip files, which C<korapxml2krill>
1625can deal with. Corpora may also be split in multiple zip archives
1626(e.g. one zip file per foundry), which is also supported (see C<--input>).
1627
1628Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1629in form of a test suite.
1630The resulting JSON format merges all annotation layers
1631based on a single token stream.
1632
1633=head2 References
1634
1635Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1636KorAP data model: first approximation, December.
1637
1638Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1639"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1640Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1641L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1642
1643Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1644"Robust corpus architecture: a new look at virtual collections and data access",
1645Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1646L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1647
1648Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1649Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1650"Towards an international standard on feature structure representation",
1651Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1652pp. 373-376.
1653L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1654
1655Harald Lüngen and C. M. Sperberg-McQueen (2012):
1656"A TEI P5 Document Grammar for the IDS Text Model",
1657Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1658L<PDF|https://journals.openedition.org/jtei/pdf/508>
1659
1660TEI Consortium, eds:
1661"Feature Structures",
1662Guidelines for Electronic Text Encoding and Interchange.
1663L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1664
Akron941c1a62016-02-23 17:41:41 +01001665=head1 AVAILABILITY
1666
1667 https://github.com/KorAP/KorAP-XML-Krill
1668
1669
1670=head1 COPYRIGHT AND LICENSE
1671
Akron8f69d632020-01-15 16:58:11 +01001672Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001673
Akron8f69d632020-01-15 16:58:11 +01001674Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001675
Akrona76d8352016-10-27 16:27:32 +02001676Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001677
1678L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1679Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001680L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001681member of the
Akronf1849aa2019-12-16 23:35:33 +01001682L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001683
1684This program is free software published under the
1685L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1686
1687=cut