blob: 502727b30d8d880a077677988e5a55a00f8c7a73 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron941c1a62016-02-23 17:41:41 +0100126# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100127
# Date of the most recent entry in the CHANGES list above.
# It is shown in the version banner, so it has to be bumped
# together with the change log.
our $LAST_CHANGE = '2017/07/04';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Banner passed as message to pod2usage
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Prototypes
sub get_file_name_from_glob($);
sub get_file_name($);
137
# Parse command
# The first argument is treated as the command, unless it is
# missing or starts with a dash (i.e. is an option).
our @ARGV;
my $cmd;
if ($ARGV[0]) {
  unless (index($ARGV[0], '-') == 0) {
    $cmd = shift @ARGV;
  };
};

# Remember the arguments following the command
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100145
# Targets of the list options
my (@skip, @sigle, @anno, @input);
my $text;

# Targets of the scalar options - they stay undefined unless passed
# on the command line
my ($input_base, $output, $overwrite, $meta, $token_base);
my ($base_sentences, $base_paragraphs, $base_pagebreaks);
my ($gzip, $extract_dir, $cache_file, $cfg_file, $log_level);
my ($primary, $pretty, $jobs, $to_tar, $sequential_extraction);
my ($cache_size, $cache_delete, $cache_init);

# Print the version banner to STDOUT, followed by the parts of the
# POD selected by the passed pod2usage parameters
my $show_usage = sub {
  pod2usage(
    @_,
    -msg    => $VERSION_MSG,
    -output => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'to-tar'                   => \$to_tar,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Show the documentation sections
  'help|h' => sub {
    $show_usage->(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99
    );
  },

  # Show the version banner with minimal verbosity
  'version|v' => sub {
    $show_usage->(-verbose => 0);
  }
);
192
Akron63f20d42017-04-10 23:40:29 +0200193
# Load from configuration
# Values from the configuration file are only used for options
# that were not set on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options and the configuration keys they are read from
  my @scalar_options = (
    [\$overwrite,             'overwrite'],
    [\$gzip,                  'gzip'],
    [\$jobs,                  'jobs'],
    [\$input_base,            'input-base'],            # Input root base directory
    [\$extract_dir,           'temporary-extract'],
    [\$token_base,            'token'],                 # Token base
    [\$cache_file,            'cache'],
    [\$cache_size,            'cache-size'],
    [\$cache_delete,          'cache-delete'],
    [\$cache_init,            'cache-init'],
    [\$sequential_extraction, 'sequential-extraction'], # Jobs for extraction
    [\$meta,                  'meta'],
    [\$output,                'output'],
    [\$base_sentences,        'base-sentences'],
    [\$base_paragraphs,       'base-paragraphs'],
    [\$base_pagebreaks,       'base-pagebreaks'],
    [\$to_tar,                'to-tar'],                # Write to tar
    [\$log_level,             'log']
  );

  foreach my $option (@scalar_options) {
    my ($target, $key) = @$option;

    # The command line has precedence
    next if defined $$target;
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # List options are given as semicolon separated values
  my @list_options = (
    [\@skip,  'skip'],
    [\@sigle, 'sigle'],
    [\@anno,  'anno']
  );

  foreach my $option (@list_options) {
    my ($target, $key) = @$option;

    # The command line has precedence
    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
305
Akron63f20d42017-04-10 23:40:29 +0200306
# Set defaults for all options that are neither given on the
# command line nor in the configuration
foreach my $default (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR'],
  [\$base_sentences,        ''],
  [\$base_paragraphs,       ''],
  [\$base_pagebreaks,       '']
) {
  my ($target, $value) = @$default;
  $$target = $value unless defined $$target;
};

# Normalize the base annotation names to lower case
$_ = lc($_) foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100323
Akron63f20d42017-04-10 23:40:29 +0200324
# Initialize log4perl object
# Messages of the chosen level are written colored to STDERR.
my $log = do {
  my $root_logger = uc($log_level) . ', STDERR';

  Log::Log4perl->init({
    'log4perl.rootLogger'      => $root_logger,
    'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout' => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
  });

  Log::Log4perl->get_logger('main');
};
334
335
# Report the configuration file in use
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};


# Parameters for pod2usage in case of invalid calls:
# Show the documentation and exit with an error code
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000352
Akronc11f7982017-02-21 21:20:14 +0100353
# A job count of -1 means: choose the number of jobs automatically,
# based on the number of CPUs reported by Sys::Info (times five)
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs on $cores cores");
};
359
Akron821db3d2017-04-06 21:19:31 +0200360
# Start serial processing
#
# In serial mode this script calls itself with the 'archive' command
# once per input, passing on all remembered arguments except for the
# inputs and the output, which are set for each run individually.
#
# $cmd is undefined if no command was passed, so the definedness is
# checked first to avoid an 'uninitialized value' warning.
if (defined $cmd && $cmd eq 'serial') {

  # Without --to-tar the output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the remembered arguments
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;


  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # The output of each run is named after the input
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving (the list form of system bypasses the shell).
    # A failing run is reported, but the remaining inputs are
    # still processed.
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
    };
  };

  exit;
};
415
# Lookup table of everything that should be skipped (lower cased)
my %skip = map { (lc($_), 1) } @skip;

# List of all supported annotations as [Foundry, Layer] pairs
my @layers;

# The base sentences and paragraphs are only added,
# if no alternative base is given
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo
# Collect all base annotations that are taken from the DeReKo structure
my @dereko_attr = ();
foreach my $base (
  [$base_sentences,  'sentences'],
  [$base_paragraphs, 'paragraphs'],
  [$base_pagebreaks, 'pagebreaks']
) {
  push @dereko_attr, $base->[1] if $base->[0] eq 'dereko#structure';
};

# The collected base annotations are passed as a third element
push @layers, (
  @dereko_attr ?
    ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
    ['DeReKo', 'Structure']
);

# Glemm
push @layers, ['Glemm', 'Morpho'];

# Malt
push @layers, ['Malt', 'Dependency'];

# MDParser
push @layers, ['MDParser', 'Dependency'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];


# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly given annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
514
# Get tokenization basis (in the form of 'Foundry#Layer')
# The variables are declared unconditionally, as the behaviour of
# a 'my' declaration with a statement modifier ('my ... if ...')
# is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Meta data cache, passed to the batch object
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that is used to convert the documents
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
538
# Get file name based on path information
#
# Expects the path of a document and returns a flat name for it:
# A leading temporary directory and the input root are removed,
# slashes are replaced by dashes and - if the result consists of
# at least four dash separated parts - the first part is dropped.
sub get_file_name ($) {
  my $file = shift;

  # Input root - for directories everything following
  # the last slash is cut off
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the input root.
  # The root is a literal path and not a pattern, so it is quoted -
  # otherwise regex metacharacters or Windows backslashes in the
  # path would be interpreted or could even break the expression.
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the first part, if at least three further parts follow
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
555
Akron63f20d42017-04-10 23:40:29 +0200556
# Get a flat name based on an input pattern
#
# Expects a path that may contain glob characters and returns a name
# without path separators, wildcards, brackets and '.zip' extension,
# e.g. 'archives/wpd15-*.zip' becomes 'archives-wpd15'.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;       # Transform paths
  $glob =~ s/[\*\?]//g;        # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;        # Remove sequences of binding characters
  $glob =~ s/^-//;             # Clean beginning
  $glob =~ s/-$//;             # Clean end
  $glob =~ s/\.zip$//;         # Remove file extension

  # A binding character in front of the extension (as in 'foo-*.zip')
  # is only at the end now, so the end has to be cleaned once more
  $glob =~ s/-$//;
  return $glob;
};
568
569
# Convert sigle to path construct
# (the part before the first underscore, the part up to the
# following dot and the rest are joined by slashes)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# With a command given, the output has to be an existing
# directory, unless --to-tar is set
if ($cmd && $output && !defined($to_tar)) {
  unless (-e $output && -d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};
579
Akron63f20d42017-04-10 23:40:29 +0200580
# Glob and prefix files
if (@input) {

  # Prefix all patterns with the input root, if given
  if ($input_base) {
    $_ = catfile($input_base, $_) foreach @input;
  };

  # Expand all patterns and sort the resulting files by length
  my @new_input = map { bsd_glob($_) } @input;
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
600
601
# Process a single file
# (this is the default, in case no command is given)
unless ($cmd) {

  # Start the benchmark at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and since the start
  sub stop_time {
    my $now = Benchmark->new;
    my $since_stop  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_stop . ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Create and parse new document
  # (the path to the input gets a trailing slash, if missing)
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time;
  exit;
};
633
Nils Diewald59094f22014-11-05 18:20:50 +0000634
Akrone10ad322016-02-27 10:54:26 +0100635# Extract XML files
Akron81500102017-04-07 20:45:44 +0200636if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100637
Akron7d4cdd82016-08-17 21:39:45 +0200638 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200639 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100640
Akron7d4cdd82016-08-17 21:39:45 +0200641 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100642 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200643 $log->error("Unzip is not installed or incompatible.");
644 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100645 };
646
Akronb0c88db2016-06-29 16:33:18 +0200647 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200648 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200649
Akron651cb8d2016-08-16 21:44:49 +0200650 my $prefix = 1;
651
Akron03b24db2016-08-16 20:54:32 +0200652 # No sigles given
653 unless (@sigle) {
654
655 # Get files
656 foreach ($archive->list_texts) {
657
658 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200659 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200660
661 # TODO: Make this OS independent
662 push @sigle, join '/', $corpus, $doc, $text;
663 };
Akron20807582016-10-26 17:11:34 +0200664 }
665
666 # Check sigle for doc sigles
667 else {
668 my @new_sigle;
669
670 my $prefix_check = 0;
671
672 # Iterate over all sigle
673 foreach (@sigle) {
674
675 # Sigle is a doc sigle
676 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200677
Akron60a8caa2017-02-17 21:51:27 +0100678 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200679 # Check if a prefix is needed
680 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100681
682 if ($prefix = $archive->check_prefix) {
683 print " with prefix ...";
684 };
Akron20807582016-10-26 17:11:34 +0200685 $prefix_check = 1;
686 };
687
Akron60a8caa2017-02-17 21:51:27 +0100688 print "\n";
689
Akron20807582016-10-26 17:11:34 +0200690 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200691 my $path = ($prefix ? './' : '') . $_;
692
693 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200694 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200695 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200696 ) ? '' : 'not '
697 );
698 print "extracted.\n";
699 }
Akron60a8caa2017-02-17 21:51:27 +0100700
701 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200702 else {
703 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100704
705 unless ($prefix_check) {
706
707 if ($prefix = $archive->check_prefix) {
708 print " with prefix ...";
709 };
710 $prefix_check = 1;
711 };
Akron20807582016-10-26 17:11:34 +0200712 };
713 };
714 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200715 };
716
Akrone10ad322016-02-27 10:54:26 +0100717 # Iterate over all given sigles and extract
718 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100719
Akron2812ba22016-10-28 21:55:59 +0200720 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200721
Akron03b24db2016-08-16 20:54:32 +0200722 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200723 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100724
Akron20807582016-10-26 17:11:34 +0200725 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200726 ($prefix ? './' : '') . $_, $output
727 ) ? '' : 'not '
728 );
Akrone10ad322016-02-27 10:54:26 +0100729 print "extracted.\n";
730 };
Akronb0c88db2016-06-29 16:33:18 +0200731 }
Akron7d4cdd82016-08-17 21:39:45 +0200732
733 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200734 else {
735 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200736 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100737 };
738}
739
Akron81500102017-04-07 20:45:44 +0200740
Akron941c1a62016-02-23 17:41:41 +0100741# Process an archive
742elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000743
Akron81500102017-04-07 20:45:44 +0200744 my $archive_output;
745
746 # First extract, then archive
747 if (defined $extract_dir) {
748
749 # Create new archive object
750 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
751
752 # Check zip capabilities
753 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200754 $log->error("Unzip is not installed or incompatible.");
755 exit 1;
Akron81500102017-04-07 20:45:44 +0200756 };
757
758 # Add further annotation archived
759 $archive->attach($_) foreach @input[1..$#input];
760
761 # Create a temporary directory
762 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200763 $extract_dir = tempdir(CLEANUP => 0);
764 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200765 };
766
Akron63f20d42017-04-10 23:40:29 +0200767 # Add some random extra to avoid clashes with multiple archives
768 $extract_dir = catdir($extract_dir, random_string('cccccc'));
769
770 # Extract to temprary directory
Akron9ec88872017-04-12 16:29:06 +0200771 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200772 @input = ($extract_dir);
773 }
774 else {
775 $log->error('Unable to extract from primary archive ' . $input[0] .
776 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200777 exit 1;
Akron81500102017-04-07 20:45:44 +0200778 };
779 }
780
781 # Can't create archive object
782 else {
783 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200784 exit 1;
Akron81500102017-04-07 20:45:44 +0200785 };
786 };
787
Akrone1dbc382016-07-08 22:24:52 +0200788 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100789
Akron7d4cdd82016-08-17 21:39:45 +0200790 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100791 my $pool = Parallel::ForkManager->new($jobs);
792
Akron7d4cdd82016-08-17 21:39:45 +0200793 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100794 my $iter = 1; # Current text in process
795
Akronda3097e2017-04-23 19:53:57 +0200796 my $tar_archive;
797 my $output_dir = $output;
798 my $tar_fh;
799
800 # Initialize tar archive
801 if ($to_tar) {
802 $tar_archive = Archive::Tar::Builder->new(
803 ignore_errors => 1
804 );
805
806 # Set output name
807 my $tar_file = $output;
808 unless ($tar_file =~ /\.tar$/) {
809 $tar_file .= '.tar';
810 };
811
812 # Initiate the tar file
813 print "Writing to file $tar_file\n";
814 $tar_fh = IO::File->new($tar_file, 'w');
815 $tar_fh->binmode(1);
816
817 # Set handle
818 $tar_archive->set_handle($tar_fh);
819
820 # Output to temporary directory
821 $output_dir = File::Temp->newdir;
822 };
823
Akron941c1a62016-02-23 17:41:41 +0100824 # Report on fork message
825 $pool->run_on_finish (
826 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200827 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100828 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200829
Akron08385f62016-03-22 20:37:04 +0100830 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200831 ($iter++) . "/$count]" .
832 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200833 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200834
835 if (!$code && $to_tar && $data->[2]) {
836 my $filename = $data->[2];
837
838 # Lock filehandle
839 if (flock($tar_fh, LOCK_EX)) {
840
Akron9a062ce2017-07-04 19:12:05 +0200841 my $clean_file = fileparse($filename);
842
Akronda3097e2017-04-23 19:53:57 +0200843 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200844 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200845 unlink $filename;
846
847 # Unlock filehandle
848 flock($tar_fh, LOCK_UN);
849 }
850 else {
851 $log->warn("Unable to add $filename to archive");
852 };
853 };
854
Akron4c0cf312016-10-15 16:42:09 +0200855 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100856 }
857 );
858
859 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200860 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100861 print "Reading data ...\n";
862
Akron7d4cdd82016-08-17 21:39:45 +0200863 # unless (Cache::FastMmap->new(
864 # share_file => $cache_file,
865 # cache_size => $cache_size,
866 # init_file => $cache_init
867 # )) {
868 # print "Unable to intialize cache '$cache_file'\n\n";
869 # exit(1);
870 # };
Akron11c80302016-03-18 19:44:43 +0100871
Akron486f9ab2017-04-22 23:25:19 +0200872
Akron941c1a62016-02-23 17:41:41 +0100873 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100874 if (-d $input[0]) {
875 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100876 my @dirs;
877 my $dir;
878
Akron7d4cdd82016-08-17 21:39:45 +0200879 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100880 while (1) {
881 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200882 push @dirs, $dir;
883 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100884 };
885 last unless $it->next;
886 };
887
888 print "Start processing ...\n";
889 $t = Benchmark->new;
890 $count = scalar @dirs;
891
892 DIRECTORY_LOOP:
893 for (my $i = 0; $i < $count; $i++) {
894
Akrone1dbc382016-07-08 22:24:52 +0200895 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200896 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200897 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200898 );
Akron941c1a62016-02-23 17:41:41 +0100899
900 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200901 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200902
Akron13d56622016-10-31 14:54:49 +0100903 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200904 $pool->finish(
905 0,
Akronda3097e2017-04-23 19:53:57 +0200906 [
907 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
908 undef,
909 $filename
910 ]
Akron486f9ab2017-04-22 23:25:19 +0200911 );
Akron3ec48972016-08-17 23:24:52 +0200912 }
913 else {
Akron4c0cf312016-10-15 16:42:09 +0200914 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200915 };
Akron941c1a62016-02-23 17:41:41 +0100916 };
917 }
918
919 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200920 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200921
Akron941c1a62016-02-23 17:41:41 +0100922 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200923 $log->error("Unzip is not installed or incompatible.");
924 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100925 };
926
Akron08385f62016-03-22 20:37:04 +0100927 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200928 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100929
Akron941c1a62016-02-23 17:41:41 +0100930 print "Start processing ...\n";
931 $t = Benchmark->new;
932 my @dirs = $archive->list_texts;
933 $count = scalar @dirs;
934
935 ARCHIVE_LOOP:
936 for (my $i = 0; $i < $count; $i++) {
937
938 # Split path information
939 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
940
Akrone1dbc382016-07-08 22:24:52 +0200941 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200942 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200943 get_file_name(
944 catfile($corpus, $doc, $text)
945 . '.json' . ($gzip ? '.gz' : '')
946 )
Akrone1dbc382016-07-08 22:24:52 +0200947 );
Akron941c1a62016-02-23 17:41:41 +0100948
949 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200950 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100951
Akron4c0cf312016-10-15 16:42:09 +0200952 # Create temporary file
953 $temp = File::Temp->newdir;
954
Akronbdf434a2016-10-24 17:42:07 +0200955 # TODO: Check if $filename exist at the beginning,
956 # because extraction can be horrible slow!
957
Akron941c1a62016-02-23 17:41:41 +0100958 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200959 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100960
Akron7d4cdd82016-08-17 21:39:45 +0200961 # Create corpus directory
962 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100963
Akron7d4cdd82016-08-17 21:39:45 +0200964 # Temporary directory
965 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100966
Akron7d4cdd82016-08-17 21:39:45 +0200967 # Write file
Akron13d56622016-10-31 14:54:49 +0100968 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200969
Akron4c0cf312016-10-15 16:42:09 +0200970 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100971 $pool->finish(
972 0,
Akronda3097e2017-04-23 19:53:57 +0200973 [
974 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
975 $temp,
976 $filename
977 ]
Akron13d56622016-10-31 14:54:49 +0100978 );
979 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200980 }
981 else {
Akron4c0cf312016-10-15 16:42:09 +0200982 # Delete temporary file
983 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200984 };
Akron941c1a62016-02-23 17:41:41 +0100985 }
Akron7d4cdd82016-08-17 21:39:45 +0200986
987 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100988 else {
Akron4c0cf312016-10-15 16:42:09 +0200989 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100990 };
991 };
992 }
993
994 else {
995 print "Input is neither a directory nor an archive.\n\n";
996 };
997
998 $pool->wait_all_children;
999
Akron11c80302016-03-18 19:44:43 +01001000 # Delete cache file
1001 unlink($cache_file) if $cache_delete;
1002
Akronda3097e2017-04-23 19:53:57 +02001003 # Close tar filehandle
1004 if ($to_tar && $tar_fh) {
1005 $tar_archive->finish;
1006 $tar_fh->close;
1007 print "Wrote to tar archive.\n";
1008 };
1009
Akron63f20d42017-04-10 23:40:29 +02001010 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001011 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001012};
Akron941c1a62016-02-23 17:41:41 +01001013
Nils Diewald2db9ad02013-10-29 19:26:43 +00001014
Akron63f20d42017-04-10 23:40:29 +02001015# Cleanup temporary extraction directory
# Runs only when $extract_dir holds a true value. The tree is deleted with
# remove_tree, which is imported from File::Path at the top of the file.
1016if ($extract_dir) {
# Per the File::Path documentation, "safe => 1" makes remove_tree skip
# entries the process is not permitted to delete instead of changing their
# permissions first, and the return value is the number of files removed.
# That count is what the message below reports as "objects".
1017 my $objects = remove_tree($extract_dir, { safe => 1 });
1018 print "Removed directory $extract_dir with $objects objects.\n";
1019};
1020
1021
# Print a trailing newline; only __END__ and the POD manual follow.
1022print "\n";
1023
Nils Diewald2db9ad02013-10-29 19:26:43 +00001024__END__
Akron941c1a62016-02-23 17:41:41 +01001025
1026=pod
1027
1028=encoding utf8
1029
1030=head1 NAME
1031
Akronf7ad89e2016-03-16 18:22:47 +01001032korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001033
1034
1035=head1 SYNOPSIS
1036
Akrona76d8352016-10-27 16:27:32 +02001037 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001038
Akron2fd402b2016-10-27 21:26:48 +02001039
Akron941c1a62016-02-23 17:41:41 +01001040=head1 DESCRIPTION
1041
1042L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1043compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001044The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001045
1046
1047=head1 INSTALLATION
1048
1049The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1050
Akronaf386982016-10-12 00:33:25 +02001051 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001052
Akronc13a1702016-03-15 19:33:14 +01001053In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001054be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001055Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +02001056In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001057
1058=head1 ARGUMENTS
1059
Akrona76d8352016-10-27 16:27:32 +02001060 $ korapxml2krill -z --input <directory> --output <filename>
1061
1062Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001063It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001064
Akron941c1a62016-02-23 17:41:41 +01001065=over 2
1066
1067=item B<archive>
1068
Akron081639e2017-04-21 19:01:39 +02001069 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001070
Akron2fd402b2016-10-27 21:26:48 +02001071Converts an archive of KorAP-XML documents. It expects a directory
1072(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001073
1074=item B<extract>
1075
Akrona76d8352016-10-27 16:27:32 +02001076 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1077
1078Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001079
Akron63f20d42017-04-10 23:40:29 +02001080=item B<serial>
1081
1082 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1083
1084Convert archives sequentially. The inputs are not merged but treated
1085as they are (so they may be premerged or globs).
1086The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001087are created based on the archive name. In case the C<--to-tar> flag is given,
1088the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001089
1090
Akron941c1a62016-02-23 17:41:41 +01001091=back
1092
1093
1094=head1 OPTIONS
1095
1096=over 2
1097
Akrona76d8352016-10-27 16:27:32 +02001098=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001099
Akrona76d8352016-10-27 16:27:32 +02001100Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001101
Akron7606afa2016-10-25 16:23:49 +02001102Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001103document, while C<archive> expects a KorAP-XML corpus folder or a zip
1104file to batch process multiple files.
1105C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001106
Akrona76d8352016-10-27 16:27:32 +02001107C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001108that the first archive listed contains all primary data files
1109and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001110
Akron7606afa2016-10-25 16:23:49 +02001111 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001112
Akron821db3d2017-04-06 21:19:31 +02001113Input may also be defined using BSD glob wildcards.
1114
1115 -i 'file/news*.zip'
1116
1117The extended input array will be sorted in length order, so the shortest
1118path needs to contain all primary data files and all meta data files.
1119
Akron0c3e3752016-06-28 15:55:53 +02001120(The directory structure follows the base directory format,
1121that may include a C<.> root folder.
1122In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001123need to be passed with a hash sign in front of the archive's name.
1124This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001125
Akron7606afa2016-10-25 16:23:49 +02001126To support zip files, a version of C<unzip> needs to be installed that is
1127compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001128
Akron7606afa2016-10-25 16:23:49 +02001129B<The root folder switch using the hash sign is experimental and
1130may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001131
Akron63f20d42017-04-10 23:40:29 +02001132=item B<--input-base|-ib> <directory>
1133
1134The base directory for inputs.
1135
1136
Akron941c1a62016-02-23 17:41:41 +01001137=item B<--output|-o> <directory|file>
1138
1139Output folder for archive processing or
1140document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001141writes to C<STDOUT> by default
1142(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001143
1144=item B<--overwrite|-w>
1145
1146Overwrite files that already exist.
1147
Akron3741f8b2016-12-21 19:55:21 +01001148=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001149
1150Define the default tokenization by specifying
1151the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001152of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001153
Akron3741f8b2016-12-21 19:55:21 +01001154
1155=item B<--base-sentences|-bs> <foundry>#<layer>
1156
1157Define the layer for base sentences.
1158If given, this will be used instead of using C<Base#Sentences>.
1159Currently C<DeReKo#Structure> is the only additional layer supported.
1160
1161 Defaults to unset.
1162
1163
1164=item B<--base-paragraphs|-bp> <foundry>#<layer>
1165
1166Define the layer for base paragraphs.
1167If given, this will be used instead of using C<Base#Paragraphs>.
1168Currently C<DeReKo#Structure> is the only additional layer supported.
1169
1170 Defaults to unset.
1171
1172
Akron41ac10b2017-02-08 22:47:25 +01001173=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1174
1175Define the layer for base pagebreaks.
1176Currently C<DeReKo#Structure> is the only layer supported.
1177
1178 Defaults to unset.
1179
1180
Akron941c1a62016-02-23 17:41:41 +01001181=item B<--skip|-s> <foundry>[#<layer>]
1182
Akronf7ad89e2016-03-16 18:22:47 +01001183Skip specific annotations by specifying the foundry
1184(and optionally the layer with a C<#>-prefix),
1185e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001186Can be set multiple times.
1187
Akronc13a1702016-03-15 19:33:14 +01001188=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001189
Akronf7ad89e2016-03-16 18:22:47 +01001190Convert specific annotations by specifying the foundry
1191(and optionally the layer with a C<#>-prefix),
1192e.g. C<Mate> or C<Mate#Morpho>.
1193Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001194
1195=item B<--primary|-p>
1196
Akronc13a1702016-03-15 19:33:14 +01001197Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001198Can be flagged using C<--no-primary> as well.
1199This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001200
1201=item B<--jobs|-j>
1202
1203Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001204for archive processing.
Akron11c80302016-03-18 19:44:43 +01001205Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001206
1207If C<sequential-extraction> is not set to true, this will
1208also apply to extraction.
1209
Akronc11f7982017-02-21 21:20:14 +01001210Pass -1, and the value will be set automatically to 5
1211times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001212This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001213
Akron9ec88872017-04-12 16:29:06 +02001214=item B<--sequential-extraction|-se>
1215
1216Flag to indicate, if the C<jobs> value also applies to extraction.
1217Some systems may have problems with extracting multiple archives
1218to the same folder at the same time.
1219Can be flagged using C<--no-sequential-extraction> as well.
1220Defaults to C<false>.
1221
Akron35db6e32016-03-17 22:42:22 +01001222=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001223
Akron35db6e32016-03-17 22:42:22 +01001224Define the metadata parser to use. Defaults to C<I5>.
1225Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1226This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001227
1228=item B<--pretty|-y>
1229
Akronc13a1702016-03-15 19:33:14 +01001230Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001231This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001232
1233=item B<--gzip|-z>
1234
Akronf7ad89e2016-03-16 18:22:47 +01001235Compress the output.
1236Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001237
Akron11c80302016-03-18 19:44:43 +01001238=item B<--cache|-c>
1239
1240File to mmap a cache (using L<Cache::FastMmap>).
1241Defaults to C<korapxml2krill.cache> in the calling directory.
1242
1243=item B<--cache-size|-cs>
1244
1245Size of the cache. Defaults to C<50m>.
1246
1247=item B<--cache-init|-ci>
1248
1249Initialize cache file.
1250Can be flagged using C<--no-cache-init> as well.
1251Defaults to C<true>.
1252
1253=item B<--cache-delete|-cd>
1254
1255Delete cache file after processing.
1256Can be flagged using C<--no-cache-delete> as well.
1257Defaults to C<true>.
1258
Akron636aa112017-04-07 18:48:56 +02001259=item B<--config|-cfg>
1260
1261Configure the parameters of your call in a file
1262of key-value pairs with whitespace separator
1263
1264 overwrite 1
1265 token DeReKo#Structure
1266 ...
1267
1268Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001269C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001270C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001271C<output>,
1272C<temp-extract>, C<sequential-extraction>,
1273C<base-sentences>, C<base-paragraphs>,
1274C<base-pagebreaks>,
1275C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001276(semicolon separated), C<anno> (semicolon separated).
1277
Akron81500102017-04-07 20:45:44 +02001278=item B<--temporary-extract|-te>
1279
1280Only valid for the C<archive> command.
1281
1282This will first extract all files into a
1283directory and then will archive.
1284If the directory is given as C<:temp:>,
1285a temporary directory is used.
1286This is especially useful to avoid
1287massive unzipping and potential
1288network latency.
Akron636aa112017-04-07 18:48:56 +02001289
Akrone10ad322016-02-27 10:54:26 +01001290=item B<--sigle|-sg>
1291
Akron20807582016-10-26 17:11:34 +02001292Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001293Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001294I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001295Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001296In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001297On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001298
Akron941c1a62016-02-23 17:41:41 +01001299=item B<--log|-l>
1300
1301The L<Log::Log4perl> log level, defaults to C<ERROR>.
1302
1303=item B<--help|-h>
1304
1305Print this document.
1306
1307=item B<--version|-v>
1308
1309Print version information.
1310
1311=back
1312
Akronc13a1702016-03-15 19:33:14 +01001313=head1 ANNOTATION SUPPORT
1314
1315L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1316developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1317The base foundry with paragraphs, sentences, and the text element is mandatory for
1318L<Krill|https://github.com/KorAP/Krill>.
1319
Akron821db3d2017-04-06 21:19:31 +02001320 Base
1321 #Paragraphs
1322 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001323
Akron821db3d2017-04-06 21:19:31 +02001324 Connexor
1325 #Morpho
1326 #Phrase
1327 #Sentences
1328 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001329
Akron821db3d2017-04-06 21:19:31 +02001330 CoreNLP
1331 #Constituency
1332 #Morpho
1333 #NamedEntities
1334 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001335
Akronce125b62017-06-19 11:54:36 +02001336 CMC
1337 #Morpho
1338
Akron821db3d2017-04-06 21:19:31 +02001339 DeReKo
1340 #Structure
Akronc13a1702016-03-15 19:33:14 +01001341
Akron821db3d2017-04-06 21:19:31 +02001342 DRuKoLa
1343 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001344
Akron821db3d2017-04-06 21:19:31 +02001345 Glemm
1346 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001347
Akron821db3d2017-04-06 21:19:31 +02001348 Malt
1349 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001350
Akron821db3d2017-04-06 21:19:31 +02001351 MarMoT
1352 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001353
Akron821db3d2017-04-06 21:19:31 +02001354 Mate
1355 #Dependency
1356 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001357
Akron821db3d2017-04-06 21:19:31 +02001358 MDParser
1359 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001360
Akron821db3d2017-04-06 21:19:31 +02001361 OpenNLP
1362 #Morpho
1363 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001364
Akron821db3d2017-04-06 21:19:31 +02001365 Sgbr
1366 #Lemma
1367 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001368
Akron821db3d2017-04-06 21:19:31 +02001369 TreeTagger
1370 #Morpho
1371 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001372
Akron821db3d2017-04-06 21:19:31 +02001373 XIP
1374 #Constituency
1375 #Morpho
1376 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001377
Akronc13a1702016-03-15 19:33:14 +01001378
1379More importers are in preparation.
1380New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1381See the built-in annotation importers as examples.
1382
Akron941c1a62016-02-23 17:41:41 +01001383=head1 AVAILABILITY
1384
1385 https://github.com/KorAP/KorAP-XML-Krill
1386
1387
1388=head1 COPYRIGHT AND LICENSE
1389
Akron3ec0a1c2017-01-18 14:41:55 +01001390Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001391
Akron941c1a62016-02-23 17:41:41 +01001392Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001393
Akrona76d8352016-10-27 16:27:32 +02001394Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001395
1396L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1397Corpus Analysis Platform at the
1398L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1399member of the
1400L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1401
1402This program is free software published under the
1403L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1404
1405=cut