blob: c264436a421169f10882a1a54dccca35e0dec702 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron941c1a62016-02-23 17:41:41 +0100126# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100127
# Date of the most recent entry in the CHANGES list above.
# Has to be bumped together with that list, as it is part of the banner.
our $LAST_CHANGE = '2017/07/04';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Banner passed as message to pod2usage for --help, --version and usage errors
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
133
# Forward declarations - both helpers accept exactly one scalar
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command:
# The first argument is taken as the command, unless it is empty
# or starts with a dash (and is therefore an option).
our @ARGV;
my $cmd = ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') ? shift(@ARGV) : undef;

# Remember the remaining arguments before option parsing consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100145
# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Single value options - they stay undefined unless passed
# on the command line
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'gzip|z'                   => \$gzip,
  'to-tar'                   => \$to_tar,
  'temporary-extract|te=s'   => \$extract_dir,
  'sequential-extraction|se' => \$sequential_extraction,

  # Document handling
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,

  # Filters
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,

  # Cache
  'cache|c=s'                => \$cache_file,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Runtime
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'jobs|j=i'                 => \$jobs,

  # Print the banner and the listed sections of the documentation
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the banner with minimal verbosity
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
192
Akron63f20d42017-04-10 23:40:29 +0200193
# Load from configuration.
# Values passed on the command line always take precedence -
# the configuration only fills options that are still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: [configuration key, variable to fill]
  my @scalar_options = (
    ['overwrite',             \$overwrite],
    ['gzip',                  \$gzip],
    ['jobs',                  \$jobs],
    ['input-base',            \$input_base],            # Input root base directory
    ['temporary-extract',     \$extract_dir],
    ['token',                 \$token_base],            # Token base
    ['cache',                 \$cache_file],            # Cache file
    ['cache-size',            \$cache_size],
    ['cache-delete',          \$cache_delete],
    ['cache-init',            \$cache_init],
    ['sequential-extraction', \$sequential_extraction], # Jobs for extraction
    ['meta',                  \$meta],
    ['output',                \$output],
    ['base-sentences',        \$base_sentences],
    ['base-paragraphs',       \$base_paragraphs],
    ['base-pagebreaks',       \$base_pagebreaks],
    ['to-tar',                \$to_tar],                # Write to tar
    ['log',                   \$log_level]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # Multi value options: given as semicolon separated list
  # and only used if nothing was passed on the command line
  my @list_options = (
    ['skip',  \@skip],
    ['sigle', \@sigle],
    ['anno',  \@anno]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
305
Akron63f20d42017-04-10 23:40:29 +0200306
# Set defaults for all options that are neither given on the
# command line nor in the configuration
foreach my $default (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR']
) {
  my ($target, $value) = @$default;
  $$target //= $value;
};

# Base annotations default to the empty string and are lowercased,
# as they are compared to lowercase identifiers later on
$_ = lc($_ // '') foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100323
Akron63f20d42017-04-10 23:40:29 +0200324
# Initialize log4perl (colored output to STDERR in the
# requested log level) and fetch the logger for 'main'
my $log = do {
  Log::Log4perl->init({
    'log4perl.rootLogger'
      => uc($log_level) . ', STDERR',
    'log4perl.appender.STDERR'
      => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout'
      => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern'
      => '[%r] %F %L %c - %m%n'
  });
  Log::Log4perl->get_logger('main');
};

if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};

# Parameters for pod2usage in case of usage errors:
# Print the documentation and leave with exit code 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Input has to be defined and
# gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000352
Akronc11f7982017-02-21 21:20:14 +0100353
# A job count of -1 requests an automatic value:
# five jobs per available CPU core
if ($jobs eq '-1') {
  my $cpu = Sys::Info->new->device('CPU');
  my $cores = $cpu->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
359
Akron821db3d2017-04-06 21:19:31 +0200360
# Start serial processing:
# Each input is converted one after the other by a separate
# call of this script with the 'archive' command. The output of
# each input is written below the given output, named after the input.
if ($cmd && $cmd eq 'serial') {

  # Output is required - otherwise the output path of each
  # input would be created relative to the file system root
  pod2usage(%ERROR_HASH) unless $output;

  if ((!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments that are
  # passed through, as they are set separately for each call
  my $remove_next = 0;
  my @pass_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as separate argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    }

    # Input or output flag with attached value (e.g. --input=file.zip)
    elsif ($arg =~ /^--?(?:input|i|output|o)=/) {
      next;
    };

    # Pass parameter
    push @pass_argv, $arg;
  };
  @keep_argv = @pass_argv;

  # Number of inputs that could not be processed successfully
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - a failure is reported, but the
    # remaining inputs are processed nonetheless
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed (status $?)");
      $failed++;
    };
  };

  # Signal failure, if at least one input could not be processed
  exit($failed ? 1 : 0);
};
415
# Foundries and layers to skip (keys are lowercased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# All supported annotation layers as [Foundry, Layer] pairs.
# The order of the list is the order the layers are passed on.
my @layers;

# Base sentences and paragraphs are only used, if they are
# not taken from another annotation
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo:
# Collect everything DeReKo#Structure is the base annotation for
my @dereko_attr;
foreach my $base (
  [$base_sentences,  'sentences'],
  [$base_paragraphs, 'paragraphs'],
  [$base_pagebreaks, 'pagebreaks']
) {
  push @dereko_attr, $base->[1] if $base->[0] eq 'dereko#structure';
};

# The collected bases are passed as a third element
push @layers, [
  'DeReKo', 'Structure',
  (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())
];

push @layers, (

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly given annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# either by Foundry or by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
514
# Get tokenization basis (given as Foundry#Layer).
# Declaration and conditional assignment are kept apart, as the
# behaviour of "my ... if ..." is undefined (see perlsyn).
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Shared cache, backed by the cache file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
);

# Create batch object, that is used for the conversion
# of all texts and receives all conversion relevant options
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
538
# Get file name based on path information.
#
# Expects the path of a text and returns a name for the text:
# A temporary directory prefix and the base path of the first input
# are removed and all slashes are replaced by dashes. If at least four
# dash separated segments remain, the first segment is removed as well.
sub get_file_name ($) {
  my $file = shift;

  # The base path is the first input. In case the input is a directory
  # without a trailing slash, the last path segment is not part of the base.
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path.
  # The base is a path and no pattern, so it has to be quoted - otherwise
  # characters like '.', '+' or '(' in the path are treated as regex syntax.
  $file =~ s/\/?\Q$base\E//;

  # Turn the path into a dash separated name without leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Remove the first segment, if at least four segments exist
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
555
Akron63f20d42017-04-10 23:40:29 +0200556
# Turn an input path (that may contain wildcards) into a name,
# that can be used for files and directories.
# The steps are applied in the given order.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # Apply all substitutions to the copy of the parameter
  for ($name) {
    s![\\\/]!-!g;      # Transform paths
    s/[\*\?]//g;       # Remove arbitrary fills
    s/[\{\}\[\]]/-/g;  # Remove class and multiple brackets
    s/\-\-+/-/g;       # Remove sequences of binding characters
    s/^-//;            # Clean beginning
    s/-$//;            # Clean end
    s/\.zip$//;        # Remove file extension
  };

  return $name;
};
568
569
# Convert sigle to path construct (e.g. A_B.C becomes A/B/C)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands that do not write to a tar file
# require the given output to be an existing directory
if ($cmd && $output && (!defined($to_tar)) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
579
Akron63f20d42017-04-10 23:40:29 +0200580
# Glob and prefix files
if (@input) {

  # Prefix each input with the input root (if given)
  # and expand all wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  my $input_list = join(', ', @input);
  print 'Input is ' . $input_list . "\n";
};
600
601
# Process a single file, if no command is given
if (!$cmd) {

  # Take the start time as early as possible
  BEGIN {
    ($main::TIME, $main::LAST_STOP) = (Benchmark->new, Benchmark->new);
  };

  # Log the time passed since the last stop and since the start
  # and remember the current time as the last stop
  sub stop_time {
    my $now = Benchmark->new;
    my $since_stop  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_stop . ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Only the first input is processed -
  # ensure the path ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
633
Nils Diewald59094f22014-11-05 18:20:50 +0000634
Akrone10ad322016-02-27 10:54:26 +0100635# Extract XML files
Akron81500102017-04-07 20:45:44 +0200636if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100637
Akrond5643ad2017-07-04 20:27:13 +0200638 # Output is required
639 pod2usage(%ERROR_HASH) unless $output;
640
Akron7d4cdd82016-08-17 21:39:45 +0200641 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200642 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100643
Akron7d4cdd82016-08-17 21:39:45 +0200644 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100645 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200646 $log->error("Unzip is not installed or incompatible.");
647 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100648 };
649
Akronb0c88db2016-06-29 16:33:18 +0200650 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200651 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200652
Akron651cb8d2016-08-16 21:44:49 +0200653 my $prefix = 1;
654
Akron03b24db2016-08-16 20:54:32 +0200655 # No sigles given
656 unless (@sigle) {
657
658 # Get files
659 foreach ($archive->list_texts) {
660
661 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200662 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200663
664 # TODO: Make this OS independent
665 push @sigle, join '/', $corpus, $doc, $text;
666 };
Akron20807582016-10-26 17:11:34 +0200667 }
668
669 # Check sigle for doc sigles
670 else {
671 my @new_sigle;
672
673 my $prefix_check = 0;
674
675 # Iterate over all sigle
676 foreach (@sigle) {
677
678 # Sigle is a doc sigle
679 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200680
Akron60a8caa2017-02-17 21:51:27 +0100681 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200682 # Check if a prefix is needed
683 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100684
685 if ($prefix = $archive->check_prefix) {
686 print " with prefix ...";
687 };
Akron20807582016-10-26 17:11:34 +0200688 $prefix_check = 1;
689 };
690
Akron60a8caa2017-02-17 21:51:27 +0100691 print "\n";
692
Akron20807582016-10-26 17:11:34 +0200693 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200694 my $path = ($prefix ? './' : '') . $_;
695
696 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200697 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200698 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200699 ) ? '' : 'not '
700 );
701 print "extracted.\n";
702 }
Akron60a8caa2017-02-17 21:51:27 +0100703
704 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200705 else {
706 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100707
708 unless ($prefix_check) {
709
710 if ($prefix = $archive->check_prefix) {
711 print " with prefix ...";
712 };
713 $prefix_check = 1;
714 };
Akron20807582016-10-26 17:11:34 +0200715 };
716 };
717 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200718 };
719
Akrone10ad322016-02-27 10:54:26 +0100720 # Iterate over all given sigles and extract
721 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100722
Akron2812ba22016-10-28 21:55:59 +0200723 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200724
Akron03b24db2016-08-16 20:54:32 +0200725 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200726 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100727
Akron20807582016-10-26 17:11:34 +0200728 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200729 ($prefix ? './' : '') . $_, $output
730 ) ? '' : 'not '
731 );
Akrone10ad322016-02-27 10:54:26 +0100732 print "extracted.\n";
733 };
Akronb0c88db2016-06-29 16:33:18 +0200734 }
Akron7d4cdd82016-08-17 21:39:45 +0200735
736 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200737 else {
738 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200739 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100740 };
741}
742
Akron81500102017-04-07 20:45:44 +0200743
Akron941c1a62016-02-23 17:41:41 +0100744# Process an archive
745elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000746
Akron81500102017-04-07 20:45:44 +0200747 my $archive_output;
748
749 # First extract, then archive
750 if (defined $extract_dir) {
751
752 # Create new archive object
753 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
754
755 # Check zip capabilities
756 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200757 $log->error("Unzip is not installed or incompatible.");
758 exit 1;
Akron81500102017-04-07 20:45:44 +0200759 };
760
761 # Add further annotation archived
762 $archive->attach($_) foreach @input[1..$#input];
763
764 # Create a temporary directory
765 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200766 $extract_dir = tempdir(CLEANUP => 0);
767 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200768 };
769
Akron63f20d42017-04-10 23:40:29 +0200770 # Add some random extra to avoid clashes with multiple archives
771 $extract_dir = catdir($extract_dir, random_string('cccccc'));
772
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200774 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200775 @input = ($extract_dir);
776 }
777 else {
778 $log->error('Unable to extract from primary archive ' . $input[0] .
779 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200780 exit 1;
Akron81500102017-04-07 20:45:44 +0200781 };
782 }
783
784 # Can't create archive object
785 else {
786 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200787 exit 1;
Akron81500102017-04-07 20:45:44 +0200788 };
789 };
790
Akrone1dbc382016-07-08 22:24:52 +0200791 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100792
Akron7d4cdd82016-08-17 21:39:45 +0200793 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100794 my $pool = Parallel::ForkManager->new($jobs);
795
Akron7d4cdd82016-08-17 21:39:45 +0200796 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100797 my $iter = 1; # Current text in process
798
Akronda3097e2017-04-23 19:53:57 +0200799 my $tar_archive;
800 my $output_dir = $output;
801 my $tar_fh;
802
803 # Initialize tar archive
804 if ($to_tar) {
805 $tar_archive = Archive::Tar::Builder->new(
806 ignore_errors => 1
807 );
808
809 # Set output name
810 my $tar_file = $output;
811 unless ($tar_file =~ /\.tar$/) {
812 $tar_file .= '.tar';
813 };
814
815 # Initiate the tar file
816 print "Writing to file $tar_file\n";
817 $tar_fh = IO::File->new($tar_file, 'w');
818 $tar_fh->binmode(1);
819
820 # Set handle
821 $tar_archive->set_handle($tar_fh);
822
823 # Output to temporary directory
824 $output_dir = File::Temp->newdir;
825 };
826
# Report on fork message: callback invoked by the pool
# for every finished conversion job
$pool->run_on_finish(
  sub {
    my ($pid, $code) = @_;

    # The data structure handed to finish() arrives as the last
    # argument. It is missing when a job ended without reporting
    # back, so fall back to a placeholder message in that case.
    my $data = pop;
    $data = ['No result reported'] unless ref($data) eq 'ARRAY';

    print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
      ($iter++) . "/$count]" .
      ($code ? " $code" : '') .
      ' ' . $data->[0] . "\n";

    # On success in tar mode, move the written file into the archive
    if (!$code && $to_tar && $data->[2]) {
      my $filename = $data->[2];

      # Lock filehandle
      if (flock($tar_fh, LOCK_EX)) {

        # Store the file in the archive under its base name only
        my $clean_file = fileparse($filename);

        # Archive and remove file
        $tar_archive->archive_as($filename => $clean_file);
        unlink $filename;

        # Unlock filehandle
        flock($tar_fh, LOCK_UN);
      }
      else {
        $log->warn("Unable to add $filename to archive");
      };
    };

    # Release the temporary directory object passed along
    # by the archive loop (if any)
    $data->[1] = undef if $data->[1];
  }
);
861
862 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200863 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100864 print "Reading data ...\n";
865
Akron7d4cdd82016-08-17 21:39:45 +0200866 # unless (Cache::FastMmap->new(
867 # share_file => $cache_file,
868 # cache_size => $cache_size,
869 # init_file => $cache_init
870 # )) {
871 # print "Unable to intialize cache '$cache_file'\n\n";
872 # exit(1);
873 # };
Akron11c80302016-03-18 19:44:43 +0100874
Akron486f9ab2017-04-22 23:25:19 +0200875
Akron941c1a62016-02-23 17:41:41 +0100876 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100877 if (-d $input[0]) {
878 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100879 my @dirs;
880 my $dir;
881
Akron7d4cdd82016-08-17 21:39:45 +0200882 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100883 while (1) {
884 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200885 push @dirs, $dir;
886 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100887 };
888 last unless $it->next;
889 };
890
891 print "Start processing ...\n";
892 $t = Benchmark->new;
893 $count = scalar @dirs;
894
895 DIRECTORY_LOOP:
896 for (my $i = 0; $i < $count; $i++) {
897
Akrone1dbc382016-07-08 22:24:52 +0200898 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200899 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200900 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200901 );
Akron941c1a62016-02-23 17:41:41 +0100902
903 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200904 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200905
Akron13d56622016-10-31 14:54:49 +0100906 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200907 $pool->finish(
908 0,
Akronda3097e2017-04-23 19:53:57 +0200909 [
910 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
911 undef,
912 $filename
913 ]
Akron486f9ab2017-04-22 23:25:19 +0200914 );
Akron3ec48972016-08-17 23:24:52 +0200915 }
916 else {
Akron4c0cf312016-10-15 16:42:09 +0200917 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200918 };
Akron941c1a62016-02-23 17:41:41 +0100919 };
920 }
921
922 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200923 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200924
Akron941c1a62016-02-23 17:41:41 +0100925 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200926 $log->error("Unzip is not installed or incompatible.");
927 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100928 };
929
    # Attach further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200931 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100932
Akron941c1a62016-02-23 17:41:41 +0100933 print "Start processing ...\n";
934 $t = Benchmark->new;
935 my @dirs = $archive->list_texts;
936 $count = scalar @dirs;
937
938 ARCHIVE_LOOP:
939 for (my $i = 0; $i < $count; $i++) {
940
941 # Split path information
942 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
943
Akrone1dbc382016-07-08 22:24:52 +0200944 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200945 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200946 get_file_name(
947 catfile($corpus, $doc, $text)
948 . '.json' . ($gzip ? '.gz' : '')
949 )
Akrone1dbc382016-07-08 22:24:52 +0200950 );
Akron941c1a62016-02-23 17:41:41 +0100951
952 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200953 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100954
Akron4c0cf312016-10-15 16:42:09 +0200955 # Create temporary file
956 $temp = File::Temp->newdir;
957
      # TODO: Check if $filename exists before extracting,
      # because extraction can be horribly slow!
960
Akron941c1a62016-02-23 17:41:41 +0100961 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200962 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100963
Akron7d4cdd82016-08-17 21:39:45 +0200964 # Create corpus directory
965 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100966
Akron7d4cdd82016-08-17 21:39:45 +0200967 # Temporary directory
968 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100969
Akron7d4cdd82016-08-17 21:39:45 +0200970 # Write file
Akron13d56622016-10-31 14:54:49 +0100971 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200972
Akron4c0cf312016-10-15 16:42:09 +0200973 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100974 $pool->finish(
975 0,
Akronda3097e2017-04-23 19:53:57 +0200976 [
977 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
978 $temp,
979 $filename
980 ]
Akron13d56622016-10-31 14:54:49 +0100981 );
982 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200983 }
984 else {
Akron4c0cf312016-10-15 16:42:09 +0200985 # Delete temporary file
986 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200987 };
Akron941c1a62016-02-23 17:41:41 +0100988 }
Akron7d4cdd82016-08-17 21:39:45 +0200989
990 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100991 else {
Akron4c0cf312016-10-15 16:42:09 +0200992 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100993 };
994 };
995 }
996
997 else {
998 print "Input is neither a directory nor an archive.\n\n";
999 };
1000
# Wait until all conversion jobs have finished and reported back
$pool->wait_all_children;

# Delete cache file
unlink($cache_file) if $cache_delete;

# Close tar filehandle
if ($to_tar && $tar_fh) {
  $tar_archive->finish;

  # Check the close, as write errors may only surface at this point
  if ($tar_fh->close) {
    print "Wrote to tar archive.\n";
  }
  else {
    $log->error("Unable to close tar archive: $!");
  };
};

# $t is only set once processing has started. It stays undefined when
# the input was neither a directory nor an archive, and timediff()
# needs two Benchmark objects.
print timestr(timediff(Benchmark->new, $t))."\n" if $t;
print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001015};
Akron941c1a62016-02-23 17:41:41 +01001016
Nils Diewald2db9ad02013-10-29 19:26:43 +00001017
# Remove the temporary extraction directory (if one was used)
# and report the count returned by remove_tree
if ($extract_dir) {
  my %remove_options = (safe => 1);
  my $removed = remove_tree($extract_dir, \%remove_options);
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};
1023
1024
1025print "\n";
1026
Nils Diewald2db9ad02013-10-29 19:26:43 +00001027__END__
Akron941c1a62016-02-23 17:41:41 +01001028
1029=pod
1030
1031=encoding utf8
1032
1033=head1 NAME
1034
Akronf7ad89e2016-03-16 18:22:47 +01001035korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001036
1037
1038=head1 SYNOPSIS
1039
  korapxml2krill [archive|extract|serial] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001041
Akron2fd402b2016-10-27 21:26:48 +02001042
Akron941c1a62016-02-23 17:41:41 +01001043=head1 DESCRIPTION
1044
1045L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1046compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001047The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001048
1049
1050=head1 INSTALLATION
1051
1052The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1053
Akronaf386982016-10-12 00:33:25 +02001054 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001055
Akronc13a1702016-03-15 19:33:14 +01001056In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001057be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001058Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001060
1061=head1 ARGUMENTS
1062
Akrona76d8352016-10-27 16:27:32 +02001063 $ korapxml2krill -z --input <directory> --output <filename>
1064
1065Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001066It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001067
Akron941c1a62016-02-23 17:41:41 +01001068=over 2
1069
1070=item B<archive>
1071
Akron081639e2017-04-21 19:01:39 +02001072 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001073
Akron2fd402b2016-10-27 21:26:48 +02001074Converts an archive of KorAP-XML documents. It expects a directory
1075(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001076
1077=item B<extract>
1078
Akrona76d8352016-10-27 16:27:32 +02001079 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1080
1081Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001082
Akron63f20d42017-04-10 23:40:29 +02001083=item B<serial>
1084
1085 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1086
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001092
1093
Akron941c1a62016-02-23 17:41:41 +01001094=back
1095
1096
1097=head1 OPTIONS
1098
1099=over 2
1100
Akrona76d8352016-10-27 16:27:32 +02001101=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001102
Akrona76d8352016-10-27 16:27:32 +02001103Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001104
Akron7606afa2016-10-25 16:23:49 +02001105Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001106document, while C<archive> expects a KorAP-XML corpus folder or a zip
1107file to batch process multiple files.
1108C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001109
Akrona76d8352016-10-27 16:27:32 +02001110C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001111that the first archive listed contains all primary data files
1112and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001113
Akron7606afa2016-10-25 16:23:49 +02001114 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001115
Akron821db3d2017-04-06 21:19:31 +02001116Input may also be defined using BSD glob wildcards.
1117
1118 -i 'file/news*.zip'
1119
The expanded list of inputs will be sorted by path length, so the shortest
path needs to contain all primary data files and all meta data files.
1122
Akron0c3e3752016-06-28 15:55:53 +02001123(The directory structure follows the base directory format,
1124that may include a C<.> root folder.
1125In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001126need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001128
Akron7606afa2016-10-25 16:23:49 +02001129To support zip files, a version of C<unzip> needs to be installed that is
1130compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001131
Akron7606afa2016-10-25 16:23:49 +02001132B<The root folder switch using the hash sign is experimental and
1133may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001134
Akron63f20d42017-04-10 23:40:29 +02001135=item B<--input-base|-ib> <directory>
1136
1137The base directory for inputs.
1138
1139
Akron941c1a62016-02-23 17:41:41 +01001140=item B<--output|-o> <directory|file>
1141
Output folder for archive processing, or
document name for single output (optional).
Writes to C<STDOUT> by default
(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001146
1147=item B<--overwrite|-w>
1148
1149Overwrite files that already exist.
1150
Akron3741f8b2016-12-21 19:55:21 +01001151=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001152
1153Define the default tokenization by specifying
1154the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001155of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001156
Akron3741f8b2016-12-21 19:55:21 +01001157
1158=item B<--base-sentences|-bs> <foundry>#<layer>
1159
1160Define the layer for base sentences.
1161If given, this will be used instead of using C<Base#Sentences>.
1162Currently C<DeReKo#Structure> is the only additional layer supported.
1163
1164 Defaults to unset.
1165
1166
1167=item B<--base-paragraphs|-bp> <foundry>#<layer>
1168
1169Define the layer for base paragraphs.
1170If given, this will be used instead of using C<Base#Paragraphs>.
1171Currently C<DeReKo#Structure> is the only additional layer supported.
1172
1173 Defaults to unset.
1174
1175
Akron41ac10b2017-02-08 22:47:25 +01001176=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1177
1178Define the layer for base pagebreaks.
1179Currently C<DeReKo#Structure> is the only layer supported.
1180
1181 Defaults to unset.
1182
1183
Akron941c1a62016-02-23 17:41:41 +01001184=item B<--skip|-s> <foundry>[#<layer>]
1185
Akronf7ad89e2016-03-16 18:22:47 +01001186Skip specific annotations by specifying the foundry
1187(and optionally the layer with a C<#>-prefix),
1188e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001189Can be set multiple times.
1190
Akronc13a1702016-03-15 19:33:14 +01001191=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001192
Akronf7ad89e2016-03-16 18:22:47 +01001193Convert specific annotations by specifying the foundry
1194(and optionally the layer with a C<#>-prefix),
1195e.g. C<Mate> or C<Mate#Morpho>.
1196Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001197
1198=item B<--primary|-p>
1199
Akronc13a1702016-03-15 19:33:14 +01001200Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001201Can be flagged using C<--no-primary> as well.
1202This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001203
1204=item B<--jobs|-j>
1205
Define the number of concurrent jobs in separate forks
for archive processing.
Akron11c80302016-03-18 19:44:43 +01001208Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001209
1210If C<sequential-extraction> is not set to false, this will
1211also apply to extraction.
1212
Akronc11f7982017-02-21 21:20:14 +01001213Pass -1, and the value will be set automatically to 5
1214times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001215This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001216
Akron9ec88872017-04-12 16:29:06 +02001217=item B<--sequential-extraction|-se>
1218
Flag to indicate whether the C<jobs> value also applies to extraction.
1220Some systems may have problems with extracting multiple archives
1221to the same folder at the same time.
1222Can be flagged using C<--no-sequential-extraction> as well.
1223Defaults to C<false>.
1224
Akron35db6e32016-03-17 22:42:22 +01001225=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001226
Akron35db6e32016-03-17 22:42:22 +01001227Define the metadata parser to use. Defaults to C<I5>.
1228Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1229This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001230
1231=item B<--pretty|-y>
1232
Akronc13a1702016-03-15 19:33:14 +01001233Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001234This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001235
1236=item B<--gzip|-z>
1237
Akronf7ad89e2016-03-16 18:22:47 +01001238Compress the output.
1239Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001240
Akron11c80302016-03-18 19:44:43 +01001241=item B<--cache|-c>
1242
1243File to mmap a cache (using L<Cache::FastMmap>).
1244Defaults to C<korapxml2krill.cache> in the calling directory.
1245
1246=item B<--cache-size|-cs>
1247
1248Size of the cache. Defaults to C<50m>.
1249
1250=item B<--cache-init|-ci>
1251
1252Initialize cache file.
1253Can be flagged using C<--no-cache-init> as well.
1254Defaults to C<true>.
1255
1256=item B<--cache-delete|-cd>
1257
1258Delete cache file after processing.
1259Can be flagged using C<--no-cache-delete> as well.
1260Defaults to C<true>.
1261
Akron636aa112017-04-07 18:48:56 +02001262=item B<--config|-cfg>
1263
Configure the parameters of your call in a file
of key-value pairs separated by whitespace:
1266
1267 overwrite 1
1268 token DeReKo#Structure
1269 ...
1270
1271Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001272C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001273C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001274C<output>,
1275C<temp-extract>, C<sequential-extraction>,
1276C<base-sentences>, C<base-paragraphs>,
1277C<base-pagebreaks>,
1278C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001279(semicolon separated), C<anno> (semicolon separated).
1280
Akron81500102017-04-07 20:45:44 +02001281=item B<--temporary-extract|-te>
1282
1283Only valid for the C<archive> command.
1284
1285This will first extract all files into a
1286directory and then will archive.
1287If the directory is given as C<:temp:>,
1288a temporary directory is used.
1289This is especially useful to avoid
1290massive unzipping and potential
1291network latency.
Akron636aa112017-04-07 18:48:56 +02001292
Akrone10ad322016-02-27 10:54:26 +01001293=item B<--sigle|-sg>
1294
Akron20807582016-10-26 17:11:34 +02001295Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001296Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001297I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001298Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001299In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001300On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001301
Akron941c1a62016-02-23 17:41:41 +01001302=item B<--log|-l>
1303
The L<Log::Log4perl> log level. Defaults to C<ERROR>.
1305
1306=item B<--help|-h>
1307
1308Print this document.
1309
1310=item B<--version|-v>
1311
1312Print version information.
1313
1314=back
1315
Akronc13a1702016-03-15 19:33:14 +01001316=head1 ANNOTATION SUPPORT
1317
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1322
Akron821db3d2017-04-06 21:19:31 +02001323 Base
1324 #Paragraphs
1325 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001326
Akron821db3d2017-04-06 21:19:31 +02001327 Connexor
1328 #Morpho
1329 #Phrase
1330 #Sentences
1331 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001332
Akron821db3d2017-04-06 21:19:31 +02001333 CoreNLP
1334 #Constituency
1335 #Morpho
1336 #NamedEntities
1337 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001338
Akronce125b62017-06-19 11:54:36 +02001339 CMC
1340 #Morpho
1341
Akron821db3d2017-04-06 21:19:31 +02001342 DeReKo
1343 #Structure
Akronc13a1702016-03-15 19:33:14 +01001344
Akron821db3d2017-04-06 21:19:31 +02001345 DRuKoLa
1346 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001347
Akron821db3d2017-04-06 21:19:31 +02001348 Glemm
1349 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001350
Akron821db3d2017-04-06 21:19:31 +02001351 Malt
1352 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001353
Akron821db3d2017-04-06 21:19:31 +02001354 MarMoT
1355 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001356
Akron821db3d2017-04-06 21:19:31 +02001357 Mate
1358 #Dependency
1359 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001360
Akron821db3d2017-04-06 21:19:31 +02001361 MDParser
1362 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001363
Akron821db3d2017-04-06 21:19:31 +02001364 OpenNLP
1365 #Morpho
1366 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001367
Akron821db3d2017-04-06 21:19:31 +02001368 Sgbr
1369 #Lemma
1370 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001371
Akron821db3d2017-04-06 21:19:31 +02001372 TreeTagger
1373 #Morpho
1374 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001375
Akron821db3d2017-04-06 21:19:31 +02001376 XIP
1377 #Constituency
1378 #Morpho
1379 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001380
Akronc13a1702016-03-15 19:33:14 +01001381
1382More importers are in preparation.
1383New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1384See the built-in annotation importers as examples.
1385
Akron941c1a62016-02-23 17:41:41 +01001386=head1 AVAILABILITY
1387
1388 https://github.com/KorAP/KorAP-XML-Krill
1389
1390
1391=head1 COPYRIGHT AND LICENSE
1392
Akron3ec0a1c2017-01-18 14:41:55 +01001393Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001394
Akron941c1a62016-02-23 17:41:41 +01001395Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001396
Akrona76d8352016-10-27 16:27:32 +02001397Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001398
1399L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1400Corpus Analysis Platform at the
1401L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1402member of the
1403L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1404
1405This program is free software published under the
1406L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1407
1408=cut