blob: f1ba9d4fab8caabefdfa8a2aba475d6b0b6cec3c [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
134# - Support for non-word tokens.
Akron941c1a62016-02-23 17:41:41 +0100135# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100136
# Release information, printed by --help and --version
our $LAST_CHANGE = '2019/01/22';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Prototypes (the subs are defined further down)
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a first argument that does not start
# with a dash is taken as the sub command
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Keep the untouched arguments, as they are
# passed on to child runs in serial processing
my @keep_argv = @ARGV;

# Targets of the list valued options
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100157
# Targets of all single valued command line options
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Print the usage sections of the documentation
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose => 99,
    -msg => $VERSION_MSG,
    -output => '-'
  );
};

# Print the version message only
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg => $VERSION_MSG,
    -output => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'                 => \@input,
  'input-base|ib=s'           => \$input_base,
  'output|o=s'                => \$output,
  'overwrite|w'               => \$overwrite,
  'meta|m=s'                  => \$meta,
  'token|t=s'                 => \$token_base,
  'base-sentences|bs=s'       => \$base_sentences,
  'base-paragraphs|bp=s'      => \$base_paragraphs,
  'base-pagebreaks|bpb=s'     => \$base_pagebreaks,
  'gzip|z'                    => \$gzip,
  'temporary-extract|te=s'    => \$extract_dir,
  'skip|s=s'                  => \@skip,
  'sigle|sg=s'                => \@sigle,
  'cache|c=s'                 => \$cache_file,
  'config|cfg=s'              => \$cfg_file,
  'log|l=s'                   => \$log_level,
  'anno|a=s'                  => \@anno,
  'primary|p!'                => \$primary,
  'pretty|y'                  => \$pretty,
  'jobs|j=i'                  => \$jobs,
  'to-tar'                    => \$to_tar,
  'non-word-tokens|nwt'       => \$non_word_tokens,
  'sequential-extraction|se'  => \$sequential_extraction,
  'cache-size|cs=s'           => \$cache_size,
  'cache-delete|cd!'          => \$cache_delete,
  'cache-init|ci!'            => \$cache_init,
  'help|h'                    => $show_help,
  'version|v'                 => $show_version
);
202
Akron63f20d42017-04-10 23:40:29 +0200203
# Load from configuration. Values from the configuration file
# are only used for options not given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single valued options: configuration key and target variable
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # The command line takes precedence
    next if defined $$target;
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # List valued options: the configuration value
  # is a list separated by semicolons
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # The command line takes precedence
    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
320
Akron63f20d42017-04-10 23:40:29 +0200321
# Set defaults for all options that were neither given
# on the command line nor in the configuration file
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;
$non_word_tokens       = 0                      unless defined $non_word_tokens;

# The base annotations default to the empty string
# and are compared in lower case
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');
Akron3741f8b2016-12-21 19:55:21 +0100339
Akron63f20d42017-04-10 23:40:29 +0200340
# Initialize log4perl object: everything at or above
# the chosen level is written colored to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file passed on the command line
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
353
354
# Parameters for pod2usage in case of wrong usage:
# print the usage sections and exit with code 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -output => '-',
  -exit => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# A job count of -1 requests five jobs per CPU core
if ($jobs eq '-1') {
  state $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs on $cores cores");
};
375
Akron821db3d2017-04-06 21:19:31 +0200376
# Start serial processing: every input is converted by a separate
# "archive" run of this script, one run after the other
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments passed on,
  # as every child run gets its own -i and -o. Getopt::Long accepts
  # the value as the next argument or attached with "=", and the
  # option name with one or two dashes - all forms are removed.
  my $remove_next = 0;
  my @child_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag, the value is the next argument
    if ($arg =~ /^--?(?:input|i|output|o)\z/) {
      $remove_next = 1;
    }

    # Input or output flag with an attached value
    elsif ($arg =~ /^--?(?:input|i|output|o)=/) {
      $remove_next = 0;
    }

    # Value of the preceding input or output flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @child_argv, $arg;
    };
  };
  @keep_argv = @child_argv;

  # Number of child runs that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving - continue with the next input on failure,
    # but remember the failure for the exit code
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $in failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
431
# Lookup table of everything to skip (in lower case)
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotations as [Foundry, Layer] pairs.
# The order of the list is the order of processing.
my @layers;

# Base - only if no other base annotation was chosen
push @layers, ['Base', 'Sentences']  if !$base_sentences;
push @layers, ['Base', 'Paragraphs'] if !$base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect the base annotations taken from the structure layer
my @dereko_attr = ();
foreach my $base (
  [ 'sentences'  => $base_sentences ],
  [ 'paragraphs' => $base_paragraphs ],
  [ 'pagebreaks' => $base_pagebreaks ]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# The collected base annotations are passed as a third element
push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

push @layers, (

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
516
Akron4fa37c32017-01-20 14:43:10 +0100517
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations
# explicitly requested as "Foundry#Layer"
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
536
# Get tokenization basis in the form "Foundry#Layer".
# The variables are declared unconditionally, as the behaviour
# of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension - the layer is undefined
# in case the token base contains no "#"
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200542
# Cache shared via the cache file
# TODO: This should not be initialized for batch
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object, that converts single documents
# based on the options given
my @batch_param = (
  cache => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  gzip => $gzip,
  log => $log,
  primary => $primary,
  pretty => $pretty,
  anno => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
564
# Get file name based on path information.
#
# Takes the path of a document and returns a flat name for it:
# A leading temporary directory fragment and the first occurrence
# of the first input path are removed and all slashes are turned
# into dashes. In case the first dash is followed by at least three
# further dash separated parts, everything up to the first dash is
# removed as well.
sub get_file_name ($) {
  my $file = shift;

  # The path is relative to the first input - for a
  # directory, the last path segment is not part of the base
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input base. The base is a file system path and no
  # pattern, so it has to be quoted - otherwise characters like
  # "+" or "." in the path are treated as regex metacharacters.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
581
Akron63f20d42017-04-10 23:40:29 +0200582
# Turn an input path or wildcard pattern into a flat name
# without path separators and wildcard characters
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # Apply all substitutions to the name
  for ($name) {
    s![\\\/]!-!g;        # Transform paths
    s/[\*\?]//g;         # Remove arbitrary fills
    s/[\{\}\[\]]/-/g;    # Remove class and multiple brackets
    s/\-\-+/-/g;         # Remove sequences of binding characters
    s/^-//;              # Clean beginning
    s/-$//;              # Clean end
    s/\.zip$//;          # Remove file extension
  };

  return $name;
};
594
595
# Convert sigle to path construct,
# i.e. "Corpus_Doc.Text" becomes "Corpus/Doc/Text"
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# All sub commands require an existing output directory,
# unless the output is written to a tar file
if ($cmd && $output && (!defined($to_tar)) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
605
Akron63f20d42017-04-10 23:40:29 +0200606
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand the wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n";
};
626
627
# Process a single file, in case no sub command is given
if (!$cmd) {

  # Take the start time as soon as this block is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last call
  # and since the start of the script
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: '. $since_last_stop . ' (overall: ' . $overall . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # the path has to end with a slash
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
659
Nils Diewald59094f22014-11-05 18:20:50 +0000660
Akrone10ad322016-02-27 10:54:26 +0100661# Extract XML files
Akron81500102017-04-07 20:45:44 +0200662if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100663
Akrond5643ad2017-07-04 20:27:13 +0200664 # Output is required
665 pod2usage(%ERROR_HASH) unless $output;
666
Akron7d4cdd82016-08-17 21:39:45 +0200667 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200668 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100669
Akron7d4cdd82016-08-17 21:39:45 +0200670 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100671 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200672 $log->error("Unzip is not installed or incompatible.");
673 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100674 };
675
Akronb0c88db2016-06-29 16:33:18 +0200676 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200677 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200678
Akron651cb8d2016-08-16 21:44:49 +0200679 my $prefix = 1;
680
Akron03b24db2016-08-16 20:54:32 +0200681 # No sigles given
682 unless (@sigle) {
683
684 # Get files
685 foreach ($archive->list_texts) {
686
687 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200688 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200689
690 # TODO: Make this OS independent
691 push @sigle, join '/', $corpus, $doc, $text;
692 };
Akron20807582016-10-26 17:11:34 +0200693 }
694
695 # Check sigle for doc sigles
696 else {
697 my @new_sigle;
698
699 my $prefix_check = 0;
700
701 # Iterate over all sigle
702 foreach (@sigle) {
703
704 # Sigle is a doc sigle
705 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200706
Akron60a8caa2017-02-17 21:51:27 +0100707 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200708 # Check if a prefix is needed
709 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100710
711 if ($prefix = $archive->check_prefix) {
712 print " with prefix ...";
713 };
Akron20807582016-10-26 17:11:34 +0200714 $prefix_check = 1;
715 };
716
Akron60a8caa2017-02-17 21:51:27 +0100717 print "\n";
718
Akron20807582016-10-26 17:11:34 +0200719 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200720 my $path = ($prefix ? './' : '') . $_;
721
722 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200723 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200724 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200725 ) ? '' : 'not '
726 );
727 print "extracted.\n";
728 }
Akron60a8caa2017-02-17 21:51:27 +0100729
730 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200731 else {
732 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100733
734 unless ($prefix_check) {
735
736 if ($prefix = $archive->check_prefix) {
737 print " with prefix ...";
738 };
739 $prefix_check = 1;
740 };
Akron20807582016-10-26 17:11:34 +0200741 };
742 };
743 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200744 };
745
Akrone10ad322016-02-27 10:54:26 +0100746 # Iterate over all given sigles and extract
747 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100748
Akron2812ba22016-10-28 21:55:59 +0200749 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200750
Akron03b24db2016-08-16 20:54:32 +0200751 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200752 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100753
Akron20807582016-10-26 17:11:34 +0200754 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200755 ($prefix ? './' : '') . $_, $output
756 ) ? '' : 'not '
757 );
Akrone10ad322016-02-27 10:54:26 +0100758 print "extracted.\n";
759 };
Akronb0c88db2016-06-29 16:33:18 +0200760 }
Akron7d4cdd82016-08-17 21:39:45 +0200761
762 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200763 else {
764 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200765 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100766 };
767}
768
Akron81500102017-04-07 20:45:44 +0200769
Akron941c1a62016-02-23 17:41:41 +0100770# Process an archive
771elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000772
Akron81500102017-04-07 20:45:44 +0200773 my $archive_output;
774
775 # First extract, then archive
776 if (defined $extract_dir) {
777
778 # Create new archive object
779 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
780
781 # Check zip capabilities
782 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200783 $log->error("Unzip is not installed or incompatible.");
784 exit 1;
Akron81500102017-04-07 20:45:44 +0200785 };
786
787 # Add further annotation archived
788 $archive->attach($_) foreach @input[1..$#input];
789
790 # Create a temporary directory
791 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200792 $extract_dir = tempdir(CLEANUP => 0);
793 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200794 };
795
Akron63f20d42017-04-10 23:40:29 +0200796 # Add some random extra to avoid clashes with multiple archives
797 $extract_dir = catdir($extract_dir, random_string('cccccc'));
798
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200800 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200801 @input = ($extract_dir);
802 }
803 else {
804 $log->error('Unable to extract from primary archive ' . $input[0] .
805 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200806 exit 1;
Akron81500102017-04-07 20:45:44 +0200807 };
808 }
809
810 # Can't create archive object
811 else {
812 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200813 exit 1;
Akron81500102017-04-07 20:45:44 +0200814 };
815 };
816
Akrone1dbc382016-07-08 22:24:52 +0200817 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100818
Akron7d4cdd82016-08-17 21:39:45 +0200819 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100820 my $pool = Parallel::ForkManager->new($jobs);
821
Akron7d4cdd82016-08-17 21:39:45 +0200822 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100823 my $iter = 1; # Current text in process
824
Akronda3097e2017-04-23 19:53:57 +0200825 my $tar_archive;
826 my $output_dir = $output;
827 my $tar_fh;
828
829 # Initialize tar archive
830 if ($to_tar) {
831 $tar_archive = Archive::Tar::Builder->new(
832 ignore_errors => 1
833 );
834
835 # Set output name
836 my $tar_file = $output;
837 unless ($tar_file =~ /\.tar$/) {
838 $tar_file .= '.tar';
839 };
840
841 # Initiate the tar file
842 print "Writing to file $tar_file\n";
843 $tar_fh = IO::File->new($tar_file, 'w');
844 $tar_fh->binmode(1);
845
846 # Set handle
847 $tar_archive->set_handle($tar_fh);
848
849 # Output to temporary directory
850 $output_dir = File::Temp->newdir;
851 };
852
Akron941c1a62016-02-23 17:41:41 +0100853 # Report on fork message
854 $pool->run_on_finish (
855 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200856 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100857 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200858
Akron08385f62016-03-22 20:37:04 +0100859 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200860 ($iter++) . "/$count]" .
861 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200862 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200863
864 if (!$code && $to_tar && $data->[2]) {
865 my $filename = $data->[2];
866
867 # Lock filehandle
868 if (flock($tar_fh, LOCK_EX)) {
869
Akron9a062ce2017-07-04 19:12:05 +0200870 my $clean_file = fileparse($filename);
871
Akronda3097e2017-04-23 19:53:57 +0200872 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200873 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200874 unlink $filename;
875
876 # Unlock filehandle
877 flock($tar_fh, LOCK_UN);
878 }
879 else {
880 $log->warn("Unable to add $filename to archive");
881 };
882 };
883
Akron4c0cf312016-10-15 16:42:09 +0200884 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100885 }
886 );
887
888 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200889 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100890 print "Reading data ...\n";
891
Akron7d4cdd82016-08-17 21:39:45 +0200892 # unless (Cache::FastMmap->new(
893 # share_file => $cache_file,
894 # cache_size => $cache_size,
895 # init_file => $cache_init
896 # )) {
897 # print "Unable to intialize cache '$cache_file'\n\n";
898 # exit(1);
899 # };
Akron11c80302016-03-18 19:44:43 +0100900
Akron486f9ab2017-04-22 23:25:19 +0200901
Akron941c1a62016-02-23 17:41:41 +0100902 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100903 if (-d $input[0]) {
904 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100905 my @dirs;
906 my $dir;
907
Akron7d4cdd82016-08-17 21:39:45 +0200908 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100909 while (1) {
910 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200911 push @dirs, $dir;
912 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100913 };
914 last unless $it->next;
915 };
916
917 print "Start processing ...\n";
918 $t = Benchmark->new;
919 $count = scalar @dirs;
920
921 DIRECTORY_LOOP:
922 for (my $i = 0; $i < $count; $i++) {
923
Akrone1dbc382016-07-08 22:24:52 +0200924 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200925 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200926 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200927 );
Akron941c1a62016-02-23 17:41:41 +0100928
929 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200930 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200931
Akron13d56622016-10-31 14:54:49 +0100932 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200933 $pool->finish(
934 0,
Akronda3097e2017-04-23 19:53:57 +0200935 [
936 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
937 undef,
938 $filename
939 ]
Akron486f9ab2017-04-22 23:25:19 +0200940 );
Akron3ec48972016-08-17 23:24:52 +0200941 }
942 else {
Akron4c0cf312016-10-15 16:42:09 +0200943 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200944 };
Akron941c1a62016-02-23 17:41:41 +0100945 };
946 }
947
948 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200949 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200950
Akron941c1a62016-02-23 17:41:41 +0100951 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200952 $log->error("Unzip is not installed or incompatible.");
953 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100954 };
955
Akron08385f62016-03-22 20:37:04 +0100956 # Add further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200957 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100958
Akron941c1a62016-02-23 17:41:41 +0100959 print "Start processing ...\n";
960 $t = Benchmark->new;
961 my @dirs = $archive->list_texts;
962 $count = scalar @dirs;
963
964 ARCHIVE_LOOP:
965 for (my $i = 0; $i < $count; $i++) {
966
967 # Split path information
968 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
969
Akrone1dbc382016-07-08 22:24:52 +0200970 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200971 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200972 get_file_name(
973 catfile($corpus, $doc, $text)
974 . '.json' . ($gzip ? '.gz' : '')
975 )
Akrone1dbc382016-07-08 22:24:52 +0200976 );
Akron941c1a62016-02-23 17:41:41 +0100977
978 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200979 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100980
Akron4c0cf312016-10-15 16:42:09 +0200981 # Create temporary file
982 $temp = File::Temp->newdir;
983
Akronbdf434a2016-10-24 17:42:07 +0200984 # TODO: Check if $filename exists at the beginning,
985 # because extraction can be horribly slow!
986
Akron941c1a62016-02-23 17:41:41 +0100987 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200988 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100989
Akron7d4cdd82016-08-17 21:39:45 +0200990 # Create corpus directory
991 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100992
Akron7d4cdd82016-08-17 21:39:45 +0200993 # Temporary directory
994 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100995
Akron7d4cdd82016-08-17 21:39:45 +0200996 # Write file
Akron13d56622016-10-31 14:54:49 +0100997 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200998
Akron4c0cf312016-10-15 16:42:09 +0200999 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001000 $pool->finish(
1001 0,
Akronda3097e2017-04-23 19:53:57 +02001002 [
1003 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1004 $temp,
1005 $filename
1006 ]
Akron13d56622016-10-31 14:54:49 +01001007 );
1008 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001009 }
1010 else {
Akron4c0cf312016-10-15 16:42:09 +02001011 # Delete temporary file
1012 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001013 };
Akron941c1a62016-02-23 17:41:41 +01001014 }
Akron7d4cdd82016-08-17 21:39:45 +02001015
1016 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001017 else {
Akron4c0cf312016-10-15 16:42:09 +02001018 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001019 };
1020 };
1021 }
1022
1023 else {
1024 print "Input is neither a directory nor an archive.\n\n";
1025 };
1026
1027 $pool->wait_all_children;
1028
Akron11c80302016-03-18 19:44:43 +01001029 # Delete cache file
1030 unlink($cache_file) if $cache_delete;
1031
Akronda3097e2017-04-23 19:53:57 +02001032 # Close tar filehandle
1033 if ($to_tar && $tar_fh) {
1034 $tar_archive->finish;
1035 $tar_fh->close;
1036 print "Wrote to tar archive.\n";
1037 };
1038
Akron63f20d42017-04-10 23:40:29 +02001039 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001040 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001041};
Akron941c1a62016-02-23 17:41:41 +01001042
Nils Diewald2db9ad02013-10-29 19:26:43 +00001043
Akron63f20d42017-04-10 23:40:29 +02001044# Clean up the extraction directory, if one was set.
# remove_tree() (File::Path) reports how many objects it deleted; the
# 'safe' option makes it skip entries the process is not permitted to delete.
1045if ($extract_dir) {
1046 my $objects = remove_tree($extract_dir, { safe => 1 });
1047 print "Removed directory $extract_dir with $objects objects.\n";
1048};
1049
1050
# Terminate the console output with a trailing newline
1051print "\n";
1052
Nils Diewald2db9ad02013-10-29 19:26:43 +00001053__END__
Akron941c1a62016-02-23 17:41:41 +01001054
1055=pod
1056
1057=encoding utf8
1058
1059=head1 NAME
1060
Akronf7ad89e2016-03-16 18:22:47 +01001061korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001062
1063
1064=head1 SYNOPSIS
1065
Akrona76d8352016-10-27 16:27:32 +02001066 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001067
Akron2fd402b2016-10-27 21:26:48 +02001068
Akron941c1a62016-02-23 17:41:41 +01001069=head1 DESCRIPTION
1070
1071L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1072compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001073The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001074
1075
1076=head1 INSTALLATION
1077
1078The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1079
Akronaf386982016-10-12 00:33:25 +02001080 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001081
Akronc13a1702016-03-15 19:33:14 +01001082In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001083be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001084Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akrona93d51b2016-10-24 20:27:48 +02001085In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001086
1087=head1 ARGUMENTS
1088
Akrona76d8352016-10-27 16:27:32 +02001089 $ korapxml2krill -z --input <directory> --output <filename>
1090
1091Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001092It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001093
Akron941c1a62016-02-23 17:41:41 +01001094=over 2
1095
1096=item B<archive>
1097
Akron081639e2017-04-21 19:01:39 +02001098 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001099
Akron2fd402b2016-10-27 21:26:48 +02001100Converts an archive of KorAP-XML documents. It expects a directory
1101(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001102
1103=item B<extract>
1104
Akrona76d8352016-10-27 16:27:32 +02001105 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1106
1107Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001108
Akron63f20d42017-04-10 23:40:29 +02001109=item B<serial>
1110
1111 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1112
1113Convert archives sequentially. The inputs are not merged but treated
1114as they are (so they may be premerged or globs).
1115The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001116are created based on the archive name. In case the C<--to-tar> flag is given,
1117the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001118
1119
Akron941c1a62016-02-23 17:41:41 +01001120=back
1121
1122
1123=head1 OPTIONS
1124
1125=over 2
1126
Akrona76d8352016-10-27 16:27:32 +02001127=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001128
Akrona76d8352016-10-27 16:27:32 +02001129Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001130
Akron7606afa2016-10-25 16:23:49 +02001131Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001132document, while C<archive> expects a KorAP-XML corpus folder or a zip
1133file to batch process multiple files.
1134C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001135
Akrona76d8352016-10-27 16:27:32 +02001136C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001137that the first archive listed contains all primary data files
1138and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001139
Akron7606afa2016-10-25 16:23:49 +02001140 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001141
Akron821db3d2017-04-06 21:19:31 +02001142Input may also be defined using BSD glob wildcards.
1143
1144 -i 'file/news*.zip'
1145
1146The extended input array will be sorted in length order, so the shortest
1147path needs to contain all primary data files and all meta data files.
1148
Akron0c3e3752016-06-28 15:55:53 +02001149(The directory structure follows the base directory format,
1150that may include a C<.> root folder.
1151In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001152need to be passed with a hash sign in front of the archive's name.
1153This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001154
Akron7606afa2016-10-25 16:23:49 +02001155To support zip files, a version of C<unzip> needs to be installed that is
1156compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001157
Akron7606afa2016-10-25 16:23:49 +02001158B<The root folder switch using the hash sign is experimental and
1159may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001160
Akronf73ffb62018-06-27 12:13:59 +02001161
Akron63f20d42017-04-10 23:40:29 +02001162=item B<--input-base|-ib> <directory>
1163
1164The base directory for inputs.
1165
1166
Akron941c1a62016-02-23 17:41:41 +01001167=item B<--output|-o> <directory|file>
1168
1169Output folder for archive processing or
1170document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001171writes to C<STDOUT> by default
1172(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001173
1174=item B<--overwrite|-w>
1175
1176Overwrite files that already exist.
1177
Akronf73ffb62018-06-27 12:13:59 +02001178
Akron3741f8b2016-12-21 19:55:21 +01001179=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001180
1181Define the default tokenization by specifying
1182the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001183of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001184
Akron3741f8b2016-12-21 19:55:21 +01001185
1186=item B<--base-sentences|-bs> <foundry>#<layer>
1187
1188Define the layer for base sentences.
1189If given, this will be used instead of using C<Base#Sentences>.
1190Currently C<DeReKo#Structure> is the only additional layer supported.
1191
1192 Defaults to unset.
1193
1194
1195=item B<--base-paragraphs|-bp> <foundry>#<layer>
1196
1197Define the layer for base paragraphs.
1198If given, this will be used instead of using C<Base#Paragraphs>.
1199Currently C<DeReKo#Structure> is the only additional layer supported.
1200
1201 Defaults to unset.
1202
1203
Akron41ac10b2017-02-08 22:47:25 +01001204=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1205
1206Define the layer for base pagebreaks.
1207Currently C<DeReKo#Structure> is the only layer supported.
1208
1209 Defaults to unset.
1210
1211
Akron941c1a62016-02-23 17:41:41 +01001212=item B<--skip|-s> <foundry>[#<layer>]
1213
Akronf7ad89e2016-03-16 18:22:47 +01001214Skip specific annotations by specifying the foundry
1215(and optionally the layer with a C<#>-prefix),
1216e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001217Can be set multiple times.
1218
Akronf73ffb62018-06-27 12:13:59 +02001219
Akronc13a1702016-03-15 19:33:14 +01001220=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001221
Akronf7ad89e2016-03-16 18:22:47 +01001222Convert specific annotations by specifying the foundry
1223(and optionally the layer with a C<#>-prefix),
1224e.g. C<Mate> or C<Mate#Morpho>.
1225Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001226
Akronf73ffb62018-06-27 12:13:59 +02001227
Akron941c1a62016-02-23 17:41:41 +01001228=item B<--primary|-p>
1229
Akronc13a1702016-03-15 19:33:14 +01001230Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001231Can be flagged using C<--no-primary> as well.
1232This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001233
Akronf73ffb62018-06-27 12:13:59 +02001234
Akroned9baf02019-01-22 17:03:25 +01001235=item B<--non-word-tokens|-nwt>
1236
1237Tokenize non-word tokens like word tokens (defined as matching
1238C</[\d\w]/>). Useful to treat punctuation as tokens.
1239
1240 Defaults to unset.
1241
Akron941c1a62016-02-23 17:41:41 +01001242=item B<--jobs|-j>
1243
1244Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001245for archive processing.
Akron11c80302016-03-18 19:44:43 +01001246Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001247
1248If C<sequential-extraction> is not set to true, this will
1249also apply to extraction.
1250
Akronc11f7982017-02-21 21:20:14 +01001251Pass -1, and the value will be set automatically to 5
1252times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001253This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001254
Akronf73ffb62018-06-27 12:13:59 +02001255
Akron9ec88872017-04-12 16:29:06 +02001256=item B<--sequential-extraction|-se>
1257
1258Flag to indicate that the C<jobs> value should not apply to extraction.
1259Some systems may have problems with extracting multiple archives
1260to the same folder at the same time.
1261Can be flagged using C<--no-sequential-extraction> as well.
1262Defaults to C<false>.
1263
Akronf73ffb62018-06-27 12:13:59 +02001264
Akron35db6e32016-03-17 22:42:22 +01001265=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001266
Akron35db6e32016-03-17 22:42:22 +01001267Define the metadata parser to use. Defaults to C<I5>.
1268Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1269This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001270
Akronf73ffb62018-06-27 12:13:59 +02001271
Akron941c1a62016-02-23 17:41:41 +01001272=item B<--pretty|-y>
1273
Akronc13a1702016-03-15 19:33:14 +01001274Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001275This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001276
Akronf73ffb62018-06-27 12:13:59 +02001277
Akron941c1a62016-02-23 17:41:41 +01001278=item B<--gzip|-z>
1279
Akronf7ad89e2016-03-16 18:22:47 +01001280Compress the output.
1281Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001282
Akronf73ffb62018-06-27 12:13:59 +02001283
Akron11c80302016-03-18 19:44:43 +01001284=item B<--cache|-c>
1285
1286File to mmap a cache (using L<Cache::FastMmap>).
1287Defaults to C<korapxml2krill.cache> in the calling directory.
1288
Akronf73ffb62018-06-27 12:13:59 +02001289
Akron11c80302016-03-18 19:44:43 +01001290=item B<--cache-size|-cs>
1291
1292Size of the cache. Defaults to C<50m>.
1293
Akronf73ffb62018-06-27 12:13:59 +02001294
Akron11c80302016-03-18 19:44:43 +01001295=item B<--cache-init|-ci>
1296
1297Initialize cache file.
1298Can be flagged using C<--no-cache-init> as well.
1299Defaults to C<true>.
1300
Akronf73ffb62018-06-27 12:13:59 +02001301
Akron11c80302016-03-18 19:44:43 +01001302=item B<--cache-delete|-cd>
1303
1304Delete cache file after processing.
1305Can be flagged using C<--no-cache-delete> as well.
1306Defaults to C<true>.
1307
Akronf73ffb62018-06-27 12:13:59 +02001308
Akron636aa112017-04-07 18:48:56 +02001309=item B<--config|-cfg>
1310
1311Configure the parameters of your call in a file
1312of key-value pairs with a whitespace separator:
1313
1314 overwrite 1
1315 token DeReKo#Structure
1316 ...
1317
1318Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001319C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001320C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001321C<output>,
1322C<temp-extract>, C<sequential-extraction>,
1323C<base-sentences>, C<base-paragraphs>,
1324C<base-pagebreaks>,
1325C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001326(semicolon separated), C<anno> (semicolon separated).
1327
Akronf73ffb62018-06-27 12:13:59 +02001328Configuration parameters will always be overwritten by
1329passed parameters.
1330
1331
Akron81500102017-04-07 20:45:44 +02001332=item B<--temporary-extract|-te>
1333
1334Only valid for the C<archive> command.
1335
1336This will first extract all files into a
1337directory and then will archive.
1338If the directory is given as C<:temp:>,
1339a temporary directory is used.
1340This is especially useful to avoid
1341massive unzipping and potential
1342network latency.
Akron636aa112017-04-07 18:48:56 +02001343
Akronf73ffb62018-06-27 12:13:59 +02001344
Akrone10ad322016-02-27 10:54:26 +01001345=item B<--sigle|-sg>
1346
Akron20807582016-10-26 17:11:34 +02001347Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001348Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001349I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001350Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001351In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001352On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001353
Akronf73ffb62018-06-27 12:13:59 +02001354
Akron941c1a62016-02-23 17:41:41 +01001355=item B<--log|-l>
1356
1357The L<Log::Log4perl> log level, defaults to C<ERROR>.
1358
Akronf73ffb62018-06-27 12:13:59 +02001359
Akron941c1a62016-02-23 17:41:41 +01001360=item B<--help|-h>
1361
1362Print this document.
1363
Akronf73ffb62018-06-27 12:13:59 +02001364
Akron941c1a62016-02-23 17:41:41 +01001365=item B<--version|-v>
1366
1367Print version information.
1368
1369=back
1370
Akronf73ffb62018-06-27 12:13:59 +02001371
Akronc13a1702016-03-15 19:33:14 +01001372=head1 ANNOTATION SUPPORT
1373
1374L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1375developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1376The base foundry with paragraphs, sentences, and the text element is mandatory for
1377L<Krill|https://github.com/KorAP/Krill>.
1378
Akron821db3d2017-04-06 21:19:31 +02001379 Base
1380 #Paragraphs
1381 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001382
Akron821db3d2017-04-06 21:19:31 +02001383 Connexor
1384 #Morpho
1385 #Phrase
1386 #Sentences
1387 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001388
Akron821db3d2017-04-06 21:19:31 +02001389 CoreNLP
1390 #Constituency
1391 #Morpho
1392 #NamedEntities
1393 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001394
Akronce125b62017-06-19 11:54:36 +02001395 CMC
1396 #Morpho
1397
Akron821db3d2017-04-06 21:19:31 +02001398 DeReKo
1399 #Structure
Akronc13a1702016-03-15 19:33:14 +01001400
Akron821db3d2017-04-06 21:19:31 +02001401 DRuKoLa
1402 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001403
Akron821db3d2017-04-06 21:19:31 +02001404 Glemm
1405 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001406
Akronea1aed52018-07-19 14:43:34 +02001407 HNC
1408 #Morpho
1409
Akron4c679192018-01-16 17:41:49 +01001410 LWC
1411 #Dependency
1412
Akron821db3d2017-04-06 21:19:31 +02001413 Malt
1414 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001415
Akron821db3d2017-04-06 21:19:31 +02001416 MarMoT
1417 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001418
Akron821db3d2017-04-06 21:19:31 +02001419 Mate
1420 #Dependency
1421 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001422
Akron821db3d2017-04-06 21:19:31 +02001423 MDParser
1424 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001425
Akron821db3d2017-04-06 21:19:31 +02001426 OpenNLP
1427 #Morpho
1428 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001429
Akron821db3d2017-04-06 21:19:31 +02001430 Sgbr
1431 #Lemma
1432 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001433
Akron821db3d2017-04-06 21:19:31 +02001434 TreeTagger
1435 #Morpho
1436 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001437
Akron821db3d2017-04-06 21:19:31 +02001438 XIP
1439 #Constituency
1440 #Morpho
1441 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001442
Akronc13a1702016-03-15 19:33:14 +01001443
1444More importers are in preparation.
1445New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1446See the built-in annotation importers as examples.
1447
Akronf73ffb62018-06-27 12:13:59 +02001448
Akron941c1a62016-02-23 17:41:41 +01001449=head1 AVAILABILITY
1450
1451 https://github.com/KorAP/KorAP-XML-Krill
1452
1453
1454=head1 COPYRIGHT AND LICENSE
1455
Akroned9baf02019-01-22 17:03:25 +01001456Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001457
Akron941c1a62016-02-23 17:41:41 +01001458Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001459
Akrona76d8352016-10-27 16:27:32 +02001460Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001461
1462L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1463Corpus Analysis Platform at the
1464L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1465member of the
1466L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1467
1468This program is free software published under the
1469L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1470
1471=cut