blob: c85992035ea692b18950358a8e40df2e8c32c3dd [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron941c1a62016-02-23 17:41:41 +0100126# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100127
Akron3abc03e2017-06-29 16:23:35 +0200128our $LAST_CHANGE = '2017/06/29';
Akron941c1a62016-02-23 17:41:41 +0100129our $LOCAL = $FindBin::Bin;
130our $VERSION_MSG = <<"VERSION";
131Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
132VERSION
133
Akron63f20d42017-04-10 23:40:29 +0200134# Prototypes
135sub get_file_name_from_glob($);
136sub get_file_name($);
137
# Parse command
139my $cmd;
140our @ARGV;
141if ($ARGV[0] && index($ARGV[0], '-') != 0) {
142 $cmd = shift @ARGV;
Akron150b29e2016-02-14 23:06:48 +0100143};
Akron63f20d42017-04-10 23:40:29 +0200144my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100145
Akron5f51d422016-08-16 16:26:43 +0200146my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100147my $text;
Akrone10ad322016-02-27 10:54:26 +0100148
Akron941c1a62016-02-23 17:41:41 +0100149# Parse options from the command line
Nils Diewald7364d1f2013-11-05 19:26:35 +0000150GetOptions(
Akron08385f62016-03-22 20:37:04 +0100151 'input|i=s' => \@input,
Akron63f20d42017-04-10 23:40:29 +0200152 'input-base|ib=s' => \(my $input_base),
Akron941c1a62016-02-23 17:41:41 +0100153 'output|o=s' => \(my $output),
154 'overwrite|w' => \(my $overwrite),
Akron35db6e32016-03-17 22:42:22 +0100155 'meta|m=s' => \(my $meta),
Akron636aa112017-04-07 18:48:56 +0200156 'token|t=s' => \(my $token_base),
157 'base-sentences|bs=s' => \(my $base_sentences),
158 'base-paragraphs|bp=s' => \(my $base_paragraphs),
159 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
Akron941c1a62016-02-23 17:41:41 +0100160 'gzip|z' => \(my $gzip),
Akron81500102017-04-07 20:45:44 +0200161 'temporary-extract|te=s' => \(my $extract_dir),
Akrone10ad322016-02-27 10:54:26 +0100162 'skip|s=s' => \@skip,
163 'sigle|sg=s' => \@sigle,
Akron636aa112017-04-07 18:48:56 +0200164 'cache|c=s' => \(my $cache_file),
165 'config|cfg=s' => \(my $cfg_file),
166 'log|l=s' => \(my $log_level),
Akron5f51d422016-08-16 16:26:43 +0200167 'anno|a=s' => \@anno,
Akron941c1a62016-02-23 17:41:41 +0100168 'primary|p!' => \(my $primary),
169 'pretty|y' => \(my $pretty),
Akron636aa112017-04-07 18:48:56 +0200170 'jobs|j=i' => \(my $jobs),
Akron486f9ab2017-04-22 23:25:19 +0200171 'to-tar' => \(my $to_tar),
Akron9ec88872017-04-12 16:29:06 +0200172 'sequential-extraction|se' => \(my $sequential_extraction),
Akron636aa112017-04-07 18:48:56 +0200173 'cache-size|cs=s' => \(my $cache_size),
174 'cache-delete|cd!' => \(my $cache_delete),
175 'cache-init|ci!' => \(my $cache_init),
Akron941c1a62016-02-23 17:41:41 +0100176 'help|h' => sub {
177 pod2usage(
178 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200179 -verbose => 99,
180 -msg => $VERSION_MSG,
181 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100182 );
183 },
184 'version|v' => sub {
185 pod2usage(
Akron7d4cdd82016-08-17 21:39:45 +0200186 -verbose => 0,
187 -msg => $VERSION_MSG,
188 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100189 )
190 }
Nils Diewald7364d1f2013-11-05 19:26:35 +0000191);
192
Akron63f20d42017-04-10 23:40:29 +0200193
# Load settings from the configuration file, if one is given and exists.
# Options set on the command line always take precedence: a configuration
# value is only applied if the corresponding option is still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: configuration key and the variable to fill
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # Command line wins
    next if defined $$target;
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # Multi value options: the configuration value is a list
  # separated by semicolons
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # Command line wins
    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
305
Akron63f20d42017-04-10 23:40:29 +0200306
Akron636aa112017-04-07 18:48:56 +0200307# Set default token base
Akron9ec88872017-04-12 16:29:06 +0200308$token_base //= 'OpenNLP#tokens';
309$cache_file //= 'korapxml2krill.cache';
310$cache_size //= '50m';
311$jobs //= 0;
312$cache_delete //= 1;
313$cache_init //= 1;
314$sequential_extraction //= 0;
315$log_level //= 'ERROR';
316$base_sentences //= '';
317$base_paragraphs //= '';
318$base_pagebreaks //= '';
Akron636aa112017-04-07 18:48:56 +0200319
Akron821db3d2017-04-06 21:19:31 +0200320$base_sentences = lc $base_sentences;
Akron3741f8b2016-12-21 19:55:21 +0100321$base_paragraphs = lc $base_paragraphs;
Akron636bd9c2017-02-09 17:13:00 +0100322$base_pagebreaks = lc $base_pagebreaks;
Akron3741f8b2016-12-21 19:55:21 +0100323
Akron63f20d42017-04-10 23:40:29 +0200324
325# Initialize log4perl object
326Log::Log4perl->init({
327 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
328 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
329 'log4perl.appender.STDERR.layout' => 'PatternLayout',
330 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
331});
332
333my $log = Log::Log4perl->get_logger('main');
334
335
336print "Reading config from $cfg_file\n" if $cfg_file;
337
338
Akron941c1a62016-02-23 17:41:41 +0100339my %ERROR_HASH = (
340 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200341 -verbose => 99,
342 -msg => $VERSION_MSG,
343 -output => '-',
344 -exit => 1
Akron941c1a62016-02-23 17:41:41 +0100345);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000346
Akron941c1a62016-02-23 17:41:41 +0100347# Input has to be defined
Akron08385f62016-03-22 20:37:04 +0100348pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000349
Akrone1dbc382016-07-08 22:24:52 +0200350# Gzip has no effect, if no output is given
351pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000352
Akronc11f7982017-02-21 21:20:14 +0100353
# A job count of -1 requests an automatic choice:
# five jobs per CPU core reported by Sys::Info
if ($jobs eq '-1') {
  my $cpu   = Sys::Info->new->device('CPU');
  my $cores = $cpu->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
359
Akron821db3d2017-04-06 21:19:31 +0200360
# Start serial processing: every input is converted by a separate
# "archive" run of this script, one run after the other. Each run
# writes to its own target below the output directory, named after
# the input pattern.
if ($cmd && $cmd eq 'serial') {

  # Output is required, as all targets are created below it.
  # (Without this check, catdir() would be called with an undefined
  # first segment and create the targets below the root directory.)
  pod2usage(%ERROR_HASH) unless $output;

  if (!defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output options from the arguments that are
  # passed on, as each run gets its own input and output
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Input or output flag with attached value, e.g. --input=archive.zip
    # (this does not match --input-base=...)
    elsif ($arg =~ /^--?(?:input|i|output|o)=/) {
      next;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;

  # Remember if any of the runs failed
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - the list form of system() bypasses the shell.
    # A failing run does not stop the remaining runs, but is reported
    # and reflected in the exit code.
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $_ failed");
      $failed = 1;
    };
  };

  exit($failed);
};
415
# Lookup table of everything that should be skipped (lower-cased)
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotation layers as [Foundry, Layer] pairs.
# The order of the list is the order of processing.
my @layers;

# Base annotations are only used, if no other base is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (
  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo - collect all base annotations taken from the structure layer
my @dereko_attr = ();
foreach my $base (
  ['sentences',  $base_sentences],
  ['paragraphs', $base_paragraphs],
  ['pagebreaks', $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# The collected bases are passed as a third element
push @layers, (
  $dereko_attr[0] ?
    ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
    ['DeReKo', 'Structure']
);

push @layers, (
  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
494
Akron4fa37c32017-01-20 14:43:10 +0100495
# Check filters: select the annotations to convert
my @filtered_anno;

# Everything is skipped - only the explicitly requested annotations
# (given as Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Use all known layers, but respect skipping
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, if neither Foundry nor Foundry#Layer should be skipped
    !$skip{$foundry} && !$skip{$foundry . '#' . $layer};
  } @layers;
};
514
# Get tokenization basis as a pair of foundry and layer.
# The declaration is kept apart from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined, if the token base
# contains no '#' separator
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200520
521# TODO: This should not be initialized for batch
522my $cache = Cache::FastMmap->new(
523 share_file => $cache_file,
524 cache_size => $cache_size,
Akron03b24db2016-08-16 20:54:32 +0200525 init_file => $cache_init
Akrone1dbc382016-07-08 22:24:52 +0200526);
527
Akron03b24db2016-08-16 20:54:32 +0200528# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200529my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200530 cache => $cache,
Akrone1dbc382016-07-08 22:24:52 +0200531 meta_type => $meta,
532 overwrite => $overwrite,
Akron03b24db2016-08-16 20:54:32 +0200533 foundry => $token_base_foundry,
534 layer => $token_base_layer,
535 gzip => $gzip,
536 log => $log,
537 primary => $primary,
538 pretty => $pretty,
539 anno => \@filtered_anno
Akrone1dbc382016-07-08 22:24:52 +0200540);
541
# Get file name based on path information.
#
# Expects the path of a text directory and returns a flat name, in
# which all remaining path segments are joined by dashes. The path of
# the first input is removed from the name.
sub get_file_name ($) {
  my $file = shift;

  # The first input serves as the base path. For a directory, all
  # characters following the last slash are dropped.
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path. The base is matched
  # literally, as paths may contain regex metacharacters like '.',
  # '+' or '('.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # With more than three dash separated segments, drop the first one
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
558
Akron63f20d42017-04-10 23:40:29 +0200559
# Derive a file system friendly name from an input glob pattern.
#
# Expects a glob pattern (or path) and returns the name as a string.
sub get_file_name_from_glob ($) {
  my $name = shift;

  # Apply all transformations to the name in order
  for ($name) {
    s![\\\/]!-!g;      # Path separators become binding characters
    s/[\*\?]//g;       # Wildcards are dropped
    s/[\{\}\[\]]/-/g;  # Class and alternation brackets become binding characters
    s/\-\-+/-/g;       # Sequences of binding characters are collapsed
    s/^-//;            # A leading binding character is removed
    s/-$//;            # A trailing binding character is removed
    s/\.zip$//;        # The file extension is removed
  };

  return $name;
};
571
572
Akrone10ad322016-02-27 10:54:26 +0100573# Convert sigle to path construct
574s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
575
# All commands write to a directory, unless the output is tarred -
# so a given output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
582
Akron63f20d42017-04-10 23:40:29 +0200583
# Glob and prefix files: all input patterns are prefixed with the
# input base (if given) and expanded to the matching paths
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    $wild_card = catfile($input_base, $wild_card) if $input_base;

    push(@new_input, bsd_glob($wild_card));
  };

  # Patterns with wildcards may match nothing at all - stop here,
  # as all further processing relies on a first input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
603
604
Akron941c1a62016-02-23 17:41:41 +0100605# Process a single file
606unless ($cmd) {
Akron08385f62016-03-22 20:37:04 +0100607 my $input = $input[0];
Nils Diewald59094f22014-11-05 18:20:50 +0000608
Akron941c1a62016-02-23 17:41:41 +0100609 BEGIN {
610 $main::TIME = Benchmark->new;
611 $main::LAST_STOP = Benchmark->new;
612 };
613
614 sub stop_time {
615 my $new = Benchmark->new;
Akron5f51d422016-08-16 16:26:43 +0200616 $log->info(
Akron941c1a62016-02-23 17:41:41 +0100617 'The code took: '.
Akron5f51d422016-08-16 16:26:43 +0200618 timestr(timediff($new, $main::LAST_STOP)) .
619 ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
620 );
Akron941c1a62016-02-23 17:41:41 +0100621 $main::LAST_STOP = $new;
622 };
623
624 # Create and parse new document
625 $input =~ s{([^/])$}{$1/};
Akron941c1a62016-02-23 17:41:41 +0100626
Akron7d4cdd82016-08-17 21:39:45 +0200627 # Process file
Akrone1dbc382016-07-08 22:24:52 +0200628 $batch_file->process($input, $output);
Akron941c1a62016-02-23 17:41:41 +0100629
Akron11c80302016-03-18 19:44:43 +0100630 # Delete cache file
631 unlink($cache_file) if $cache_delete;
632
Akron5f51d422016-08-16 16:26:43 +0200633 stop_time;
Akron3abc03e2017-06-29 16:23:35 +0200634 exit;
Akron81500102017-04-07 20:45:44 +0200635};
636
Nils Diewald59094f22014-11-05 18:20:50 +0000637
Akrone10ad322016-02-27 10:54:26 +0100638# Extract XML files
Akron81500102017-04-07 20:45:44 +0200639if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100640
Akrond5643ad2017-07-04 20:27:13 +0200641 # Output is required
642 pod2usage(%ERROR_HASH) unless $output;
643
Akron7d4cdd82016-08-17 21:39:45 +0200644 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200645 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100646
Akron7d4cdd82016-08-17 21:39:45 +0200647 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100648 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200649 $log->error("Unzip is not installed or incompatible.");
650 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100651 };
652
Akronb0c88db2016-06-29 16:33:18 +0200653 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200654 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200655
Akron651cb8d2016-08-16 21:44:49 +0200656 my $prefix = 1;
657
Akron03b24db2016-08-16 20:54:32 +0200658 # No sigles given
659 unless (@sigle) {
660
661 # Get files
662 foreach ($archive->list_texts) {
663
664 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200665 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200666
667 # TODO: Make this OS independent
668 push @sigle, join '/', $corpus, $doc, $text;
669 };
Akron20807582016-10-26 17:11:34 +0200670 }
671
672 # Check sigle for doc sigles
673 else {
674 my @new_sigle;
675
676 my $prefix_check = 0;
677
678 # Iterate over all sigle
679 foreach (@sigle) {
680
681 # Sigle is a doc sigle
682 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200683
Akron60a8caa2017-02-17 21:51:27 +0100684 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200685 # Check if a prefix is needed
686 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100687
688 if ($prefix = $archive->check_prefix) {
689 print " with prefix ...";
690 };
Akron20807582016-10-26 17:11:34 +0200691 $prefix_check = 1;
692 };
693
Akron60a8caa2017-02-17 21:51:27 +0100694 print "\n";
695
Akron20807582016-10-26 17:11:34 +0200696 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200697 my $path = ($prefix ? './' : '') . $_;
698
699 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200700 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200701 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200702 ) ? '' : 'not '
703 );
704 print "extracted.\n";
705 }
Akron60a8caa2017-02-17 21:51:27 +0100706
707 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200708 else {
709 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100710
711 unless ($prefix_check) {
712
713 if ($prefix = $archive->check_prefix) {
714 print " with prefix ...";
715 };
716 $prefix_check = 1;
717 };
Akron20807582016-10-26 17:11:34 +0200718 };
719 };
720 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200721 };
722
Akrone10ad322016-02-27 10:54:26 +0100723 # Iterate over all given sigles and extract
724 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100725
Akron2812ba22016-10-28 21:55:59 +0200726 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200727
Akron03b24db2016-08-16 20:54:32 +0200728 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200729 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100730
Akron20807582016-10-26 17:11:34 +0200731 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200732 ($prefix ? './' : '') . $_, $output
733 ) ? '' : 'not '
734 );
Akrone10ad322016-02-27 10:54:26 +0100735 print "extracted.\n";
736 };
Akronb0c88db2016-06-29 16:33:18 +0200737 }
Akron7d4cdd82016-08-17 21:39:45 +0200738
739 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200740 else {
741 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200742 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100743 };
744}
745
Akron81500102017-04-07 20:45:44 +0200746
Akron941c1a62016-02-23 17:41:41 +0100747# Process an archive
748elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000749
Akron81500102017-04-07 20:45:44 +0200750 my $archive_output;
751
752 # First extract, then archive
753 if (defined $extract_dir) {
754
755 # Create new archive object
756 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
757
758 # Check zip capabilities
759 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200760 $log->error("Unzip is not installed or incompatible.");
761 exit 1;
Akron81500102017-04-07 20:45:44 +0200762 };
763
764 # Add further annotation archived
765 $archive->attach($_) foreach @input[1..$#input];
766
767 # Create a temporary directory
768 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200769 $extract_dir = tempdir(CLEANUP => 0);
770 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200771 };
772
Akron63f20d42017-04-10 23:40:29 +0200773 # Add some random extra to avoid clashes with multiple archives
774 $extract_dir = catdir($extract_dir, random_string('cccccc'));
775
776 # Extract to temprary directory
Akron9ec88872017-04-12 16:29:06 +0200777 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200778 @input = ($extract_dir);
779 }
780 else {
781 $log->error('Unable to extract from primary archive ' . $input[0] .
782 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200783 exit 1;
Akron81500102017-04-07 20:45:44 +0200784 };
785 }
786
787 # Can't create archive object
788 else {
789 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200790 exit 1;
Akron81500102017-04-07 20:45:44 +0200791 };
792 };
793
Akrone1dbc382016-07-08 22:24:52 +0200794 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100795
Akron7d4cdd82016-08-17 21:39:45 +0200796 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100797 my $pool = Parallel::ForkManager->new($jobs);
798
Akron7d4cdd82016-08-17 21:39:45 +0200799 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100800 my $iter = 1; # Current text in process
801
Akronda3097e2017-04-23 19:53:57 +0200802 my $tar_archive;
803 my $output_dir = $output;
804 my $tar_fh;
805
806 # Initialize tar archive
807 if ($to_tar) {
808 $tar_archive = Archive::Tar::Builder->new(
809 ignore_errors => 1
810 );
811
812 # Set output name
813 my $tar_file = $output;
814 unless ($tar_file =~ /\.tar$/) {
815 $tar_file .= '.tar';
816 };
817
818 # Initiate the tar file
819 print "Writing to file $tar_file\n";
820 $tar_fh = IO::File->new($tar_file, 'w');
821 $tar_fh->binmode(1);
822
823 # Set handle
824 $tar_archive->set_handle($tar_fh);
825
826 # Output to temporary directory
827 $output_dir = File::Temp->newdir;
828 };
829
Akron941c1a62016-02-23 17:41:41 +0100830 # Report on fork message
831 $pool->run_on_finish (
832 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200833 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100834 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200835
Akron08385f62016-03-22 20:37:04 +0100836 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200837 ($iter++) . "/$count]" .
838 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200839 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200840
841 if (!$code && $to_tar && $data->[2]) {
842 my $filename = $data->[2];
843
844 # Lock filehandle
845 if (flock($tar_fh, LOCK_EX)) {
846
Akron9a062ce2017-07-04 19:12:05 +0200847 my $clean_file = fileparse($filename);
848
Akronda3097e2017-04-23 19:53:57 +0200849 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200850 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200851 unlink $filename;
852
853 # Unlock filehandle
854 flock($tar_fh, LOCK_UN);
855 }
856 else {
857 $log->warn("Unable to add $filename to archive");
858 };
859 };
860
Akron4c0cf312016-10-15 16:42:09 +0200861 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100862 }
863 );
864
865 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200866 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100867 print "Reading data ...\n";
868
Akron7d4cdd82016-08-17 21:39:45 +0200869 # unless (Cache::FastMmap->new(
870 # share_file => $cache_file,
871 # cache_size => $cache_size,
872 # init_file => $cache_init
873 # )) {
874 # print "Unable to intialize cache '$cache_file'\n\n";
875 # exit(1);
876 # };
Akron11c80302016-03-18 19:44:43 +0100877
Akron486f9ab2017-04-22 23:25:19 +0200878
Akron941c1a62016-02-23 17:41:41 +0100879 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100880 if (-d $input[0]) {
881 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100882 my @dirs;
883 my $dir;
884
Akron7d4cdd82016-08-17 21:39:45 +0200885 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100886 while (1) {
887 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200888 push @dirs, $dir;
889 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100890 };
891 last unless $it->next;
892 };
893
894 print "Start processing ...\n";
895 $t = Benchmark->new;
896 $count = scalar @dirs;
897
898 DIRECTORY_LOOP:
899 for (my $i = 0; $i < $count; $i++) {
900
Akrone1dbc382016-07-08 22:24:52 +0200901 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200902 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200903 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200904 );
Akron941c1a62016-02-23 17:41:41 +0100905
906 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200907 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200908
Akron13d56622016-10-31 14:54:49 +0100909 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200910 $pool->finish(
911 0,
Akronda3097e2017-04-23 19:53:57 +0200912 [
913 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
914 undef,
915 $filename
916 ]
Akron486f9ab2017-04-22 23:25:19 +0200917 );
Akron3ec48972016-08-17 23:24:52 +0200918 }
919 else {
Akron4c0cf312016-10-15 16:42:09 +0200920 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200921 };
Akron941c1a62016-02-23 17:41:41 +0100922 };
923 }
924
925 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200926 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200927
Akron941c1a62016-02-23 17:41:41 +0100928 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200929 $log->error("Unzip is not installed or incompatible.");
930 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100931 };
932
Akron08385f62016-03-22 20:37:04 +0100933 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200934 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100935
Akron941c1a62016-02-23 17:41:41 +0100936 print "Start processing ...\n";
937 $t = Benchmark->new;
938 my @dirs = $archive->list_texts;
939 $count = scalar @dirs;
940
941 ARCHIVE_LOOP:
942 for (my $i = 0; $i < $count; $i++) {
943
944 # Split path information
945 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
946
Akrone1dbc382016-07-08 22:24:52 +0200947 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200948 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200949 get_file_name(
950 catfile($corpus, $doc, $text)
951 . '.json' . ($gzip ? '.gz' : '')
952 )
Akrone1dbc382016-07-08 22:24:52 +0200953 );
Akron941c1a62016-02-23 17:41:41 +0100954
955 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200956 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100957
Akron4c0cf312016-10-15 16:42:09 +0200958 # Create temporary file
959 $temp = File::Temp->newdir;
960
Akronbdf434a2016-10-24 17:42:07 +0200961 # TODO: Check if $filename exist at the beginning,
962 # because extraction can be horrible slow!
963
Akron941c1a62016-02-23 17:41:41 +0100964 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200965 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100966
Akron7d4cdd82016-08-17 21:39:45 +0200967 # Create corpus directory
968 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100969
Akron7d4cdd82016-08-17 21:39:45 +0200970 # Temporary directory
971 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100972
Akron7d4cdd82016-08-17 21:39:45 +0200973 # Write file
Akron13d56622016-10-31 14:54:49 +0100974 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200975
Akron4c0cf312016-10-15 16:42:09 +0200976 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100977 $pool->finish(
978 0,
Akronda3097e2017-04-23 19:53:57 +0200979 [
980 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
981 $temp,
982 $filename
983 ]
Akron13d56622016-10-31 14:54:49 +0100984 );
985 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200986 }
987 else {
Akron4c0cf312016-10-15 16:42:09 +0200988 # Delete temporary file
989 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200990 };
Akron941c1a62016-02-23 17:41:41 +0100991 }
Akron7d4cdd82016-08-17 21:39:45 +0200992
993 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100994 else {
Akron4c0cf312016-10-15 16:42:09 +0200995 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100996 };
997 };
998 }
999
1000 else {
1001 print "Input is neither a directory nor an archive.\n\n";
1002 };
1003
1004 $pool->wait_all_children;
1005
Akron11c80302016-03-18 19:44:43 +01001006 # Delete cache file
1007 unlink($cache_file) if $cache_delete;
1008
Akronda3097e2017-04-23 19:53:57 +02001009 # Close tar filehandle
1010 if ($to_tar && $tar_fh) {
1011 $tar_archive->finish;
1012 $tar_fh->close;
1013 print "Wrote to tar archive.\n";
1014 };
1015
Akron63f20d42017-04-10 23:40:29 +02001016 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001017 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001018};
Akron941c1a62016-02-23 17:41:41 +01001019
Nils Diewald2db9ad02013-10-29 19:26:43 +00001020
# Remove the temporary extraction directory (if one was set)
# and report the object count returned by remove_tree
if ($extract_dir) {
  my %remove_opts = (safe => 1);
  my $removed = remove_tree($extract_dir, \%remove_opts);
  print "Removed directory $extract_dir with $removed objects.\n";
};

# Terminate the output with an empty line
print "\n";
1029
Nils Diewald2db9ad02013-10-29 19:26:43 +00001030__END__
Akron941c1a62016-02-23 17:41:41 +01001031
1032=pod
1033
1034=encoding utf8
1035
1036=head1 NAME
1037
Akronf7ad89e2016-03-16 18:22:47 +01001038korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001039
1040
1041=head1 SYNOPSIS
1042
Akrona76d8352016-10-27 16:27:32 +02001043 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001044
Akron2fd402b2016-10-27 21:26:48 +02001045
Akron941c1a62016-02-23 17:41:41 +01001046=head1 DESCRIPTION
1047
1048L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1049compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001050The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001051
1052
1053=head1 INSTALLATION
1054
1055The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1056
Akronaf386982016-10-12 00:33:25 +02001057 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001058
Akronc13a1702016-03-15 19:33:14 +01001059In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001060be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001061Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001063
1064=head1 ARGUMENTS
1065
Akrona76d8352016-10-27 16:27:32 +02001066 $ korapxml2krill -z --input <directory> --output <filename>
1067
1068Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001069It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001070
Akron941c1a62016-02-23 17:41:41 +01001071=over 2
1072
1073=item B<archive>
1074
Akron081639e2017-04-21 19:01:39 +02001075 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001076
Akron2fd402b2016-10-27 21:26:48 +02001077Converts an archive of KorAP-XML documents. It expects a directory
1078(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001079
1080=item B<extract>
1081
Akrona76d8352016-10-27 16:27:32 +02001082 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1083
1084Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001085
Akron63f20d42017-04-10 23:40:29 +02001086=item B<serial>
1087
1088 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1089
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001095
1096
Akron941c1a62016-02-23 17:41:41 +01001097=back
1098
1099
1100=head1 OPTIONS
1101
1102=over 2
1103
Akrona76d8352016-10-27 16:27:32 +02001104=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001105
Akrona76d8352016-10-27 16:27:32 +02001106Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001107
Akron7606afa2016-10-25 16:23:49 +02001108Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001109document, while C<archive> expects a KorAP-XML corpus folder or a zip
1110file to batch process multiple files.
1111C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001112
Akrona76d8352016-10-27 16:27:32 +02001113C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001114that the first archive listed contains all primary data files
1115and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001116
Akron7606afa2016-10-25 16:23:49 +02001117 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001118
Akron821db3d2017-04-06 21:19:31 +02001119Input may also be defined using BSD glob wildcards.
1120
1121 -i 'file/news*.zip'
1122
1123The extended input array will be sorted in length order, so the shortest
1124path needs to contain all primary data files and all meta data files.
1125
Akron0c3e3752016-06-28 15:55:53 +02001126(The directory structure follows the base directory format,
1127that may include a C<.> root folder.
1128In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001129need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001131
Akron7606afa2016-10-25 16:23:49 +02001132To support zip files, a version of C<unzip> needs to be installed that is
1133compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001134
Akron7606afa2016-10-25 16:23:49 +02001135B<The root folder switch using the hash sign is experimental and
1136may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001137
Akron63f20d42017-04-10 23:40:29 +02001138=item B<--input-base|-ib> <directory>
1139
1140The base directory for inputs.
1141
1142
Akron941c1a62016-02-23 17:41:41 +01001143=item B<--output|-o> <directory|file>
1144
1145Output folder for archive processing or
1146document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001147writes to C<STDOUT> by default
1148(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001149
1150=item B<--overwrite|-w>
1151
1152Overwrite files that already exist.
1153
Akron3741f8b2016-12-21 19:55:21 +01001154=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001155
1156Define the default tokenization by specifying
1157the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001158of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001159
Akron3741f8b2016-12-21 19:55:21 +01001160
1161=item B<--base-sentences|-bs> <foundry>#<layer>
1162
1163Define the layer for base sentences.
1164If given, this will be used instead of using C<Base#Sentences>.
1165Currently C<DeReKo#Structure> is the only additional layer supported.
1166
1167 Defaults to unset.
1168
1169
1170=item B<--base-paragraphs|-bp> <foundry>#<layer>
1171
1172Define the layer for base paragraphs.
1173If given, this will be used instead of using C<Base#Paragraphs>.
1174Currently C<DeReKo#Structure> is the only additional layer supported.
1175
1176 Defaults to unset.
1177
1178
Akron41ac10b2017-02-08 22:47:25 +01001179=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1180
1181Define the layer for base pagebreaks.
1182Currently C<DeReKo#Structure> is the only layer supported.
1183
1184 Defaults to unset.
1185
1186
Akron941c1a62016-02-23 17:41:41 +01001187=item B<--skip|-s> <foundry>[#<layer>]
1188
Akronf7ad89e2016-03-16 18:22:47 +01001189Skip specific annotations by specifying the foundry
1190(and optionally the layer with a C<#>-prefix),
1191e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001192Can be set multiple times.
1193
Akronc13a1702016-03-15 19:33:14 +01001194=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001195
Akronf7ad89e2016-03-16 18:22:47 +01001196Convert specific annotations by specifying the foundry
1197(and optionally the layer with a C<#>-prefix),
1198e.g. C<Mate> or C<Mate#Morpho>.
1199Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001200
1201=item B<--primary|-p>
1202
Akronc13a1702016-03-15 19:33:14 +01001203Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001204Can be flagged using C<--no-primary> as well.
1205This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001206
1207=item B<--jobs|-j>
1208
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001210for archive processing.
Akron11c80302016-03-18 19:44:43 +01001211Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001212
1213If C<sequential-extraction> is not set to false, this will
1214also apply to extraction.
1215
Akronc11f7982017-02-21 21:20:14 +01001216Pass -1, and the value will be set automatically to 5
1217times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001218This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001219
Akron9ec88872017-04-12 16:29:06 +02001220=item B<--sequential-extraction|-se>
1221
1222Flag to indicate, if the C<jobs> value also applies to extraction.
1223Some systems may have problems with extracting multiple archives
1224to the same folder at the same time.
1225Can be flagged using C<--no-sequential-extraction> as well.
1226Defaults to C<false>.
1227
Akron35db6e32016-03-17 22:42:22 +01001228=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001229
Akron35db6e32016-03-17 22:42:22 +01001230Define the metadata parser to use. Defaults to C<I5>.
1231Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1232This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001233
1234=item B<--pretty|-y>
1235
Akronc13a1702016-03-15 19:33:14 +01001236Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001237This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001238
1239=item B<--gzip|-z>
1240
Akronf7ad89e2016-03-16 18:22:47 +01001241Compress the output.
1242Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001243
Akron11c80302016-03-18 19:44:43 +01001244=item B<--cache|-c>
1245
1246File to mmap a cache (using L<Cache::FastMmap>).
1247Defaults to C<korapxml2krill.cache> in the calling directory.
1248
1249=item B<--cache-size|-cs>
1250
1251Size of the cache. Defaults to C<50m>.
1252
1253=item B<--cache-init|-ci>
1254
1255Initialize cache file.
1256Can be flagged using C<--no-cache-init> as well.
1257Defaults to C<true>.
1258
1259=item B<--cache-delete|-cd>
1260
1261Delete cache file after processing.
1262Can be flagged using C<--no-cache-delete> as well.
1263Defaults to C<true>.
1264
Akron636aa112017-04-07 18:48:56 +02001265=item B<--config|-cfg>
1266
Configure the parameters of your call in a file
of key-value pairs with whitespace separator:
1269
1270 overwrite 1
1271 token DeReKo#Structure
1272 ...
1273
1274Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001275C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001276C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001277C<output>,
1278C<temp-extract>, C<sequential-extraction>,
1279C<base-sentences>, C<base-paragraphs>,
1280C<base-pagebreaks>,
1281C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001282(semicolon separated), C<anno> (semicolon separated).
1283
Akron81500102017-04-07 20:45:44 +02001284=item B<--temporary-extract|-te>
1285
1286Only valid for the C<archive> command.
1287
1288This will first extract all files into a
1289directory and then will archive.
1290If the directory is given as C<:temp:>,
1291a temporary directory is used.
1292This is especially useful to avoid
1293massive unzipping and potential
1294network latency.
Akron636aa112017-04-07 18:48:56 +02001295
Akrone10ad322016-02-27 10:54:26 +01001296=item B<--sigle|-sg>
1297
Akron20807582016-10-26 17:11:34 +02001298Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001299Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001300I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001301Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001302In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001303On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001304
Akron941c1a62016-02-23 17:41:41 +01001305=item B<--log|-l>
1306
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1308
1309=item B<--help|-h>
1310
1311Print this document.
1312
1313=item B<--version|-v>
1314
1315Print version information.
1316
1317=back
1318
Akronc13a1702016-03-15 19:33:14 +01001319=head1 ANNOTATION SUPPORT
1320
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1325
Akron821db3d2017-04-06 21:19:31 +02001326 Base
1327 #Paragraphs
1328 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001329
Akron821db3d2017-04-06 21:19:31 +02001330 Connexor
1331 #Morpho
1332 #Phrase
1333 #Sentences
1334 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001335
Akron821db3d2017-04-06 21:19:31 +02001336 CoreNLP
1337 #Constituency
1338 #Morpho
1339 #NamedEntities
1340 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001341
Akronce125b62017-06-19 11:54:36 +02001342 CMC
1343 #Morpho
1344
Akron821db3d2017-04-06 21:19:31 +02001345 DeReKo
1346 #Structure
Akronc13a1702016-03-15 19:33:14 +01001347
Akron821db3d2017-04-06 21:19:31 +02001348 DRuKoLa
1349 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001350
Akron821db3d2017-04-06 21:19:31 +02001351 Glemm
1352 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001353
Akron821db3d2017-04-06 21:19:31 +02001354 Malt
1355 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001356
Akron821db3d2017-04-06 21:19:31 +02001357 MarMoT
1358 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001359
Akron821db3d2017-04-06 21:19:31 +02001360 Mate
1361 #Dependency
1362 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001363
Akron821db3d2017-04-06 21:19:31 +02001364 MDParser
1365 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001366
Akron821db3d2017-04-06 21:19:31 +02001367 OpenNLP
1368 #Morpho
1369 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001370
Akron821db3d2017-04-06 21:19:31 +02001371 Sgbr
1372 #Lemma
1373 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001374
Akron821db3d2017-04-06 21:19:31 +02001375 TreeTagger
1376 #Morpho
1377 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001378
Akron821db3d2017-04-06 21:19:31 +02001379 XIP
1380 #Constituency
1381 #Morpho
1382 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001383
Akronc13a1702016-03-15 19:33:14 +01001384
1385More importers are in preparation.
1386New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1387See the built-in annotation importers as examples.
1388
Akron941c1a62016-02-23 17:41:41 +01001389=head1 AVAILABILITY
1390
1391 https://github.com/KorAP/KorAP-XML-Krill
1392
1393
1394=head1 COPYRIGHT AND LICENSE
1395
Akron3ec0a1c2017-01-18 14:41:55 +01001396Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001397
Akron941c1a62016-02-23 17:41:41 +01001398Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001399
Akrona76d8352016-10-27 16:27:32 +02001400Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001401
1402L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1403Corpus Analysis Platform at the
1404L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1405member of the
1406L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1407
1408This program is free software published under the
1409L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1410
1411=cut