blob: 4ff4b5bd3fb399efe659b7b60c249c3b5d9dbed5 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron941c1a62016-02-23 17:41:41 +0100129# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100130
Akron4c679192018-01-16 17:41:49 +0100131our $LAST_CHANGE = '2018/01/16';
Akron941c1a62016-02-23 17:41:41 +0100132our $LOCAL = $FindBin::Bin;
133our $VERSION_MSG = <<"VERSION";
134Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
135VERSION
136
# Prototypes (forward declarations of subs defined further down)
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a leading argument that does not start
# with a dash is taken off the argument list as the command
my $cmd;
if ($ARGV[0] && $ARGV[0] !~ /^-/) {
  $cmd = shift @ARGV;
}

# Keep a copy of the remaining arguments
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100148
# List valued options
my (@skip, @sigle, @anno, @input);
my $text;

# Scalar valued options
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'to-tar'                   => \$to_tar,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Show the selected POD sections together with the version message
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Show the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
195
Akron63f20d42017-04-10 23:40:29 +0200196
# Load from configuration
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options as pairs of configuration key and target variable.
  # A value that is already defined (i.e. was passed on the command
  # line) is never replaced by the configuration.
  my @scalar_opts = (
    ['overwrite',             \$overwrite],
    ['gzip',                  \$gzip],
    ['jobs',                  \$jobs],
    ['input-base',            \$input_base],
    ['temporary-extract',     \$extract_dir],
    ['token',                 \$token_base],
    ['cache',                 \$cache_file],
    ['cache-size',            \$cache_size],
    ['cache-delete',          \$cache_delete],
    ['cache-init',            \$cache_init],
    ['sequential-extraction', \$sequential_extraction],
    ['meta',                  \$meta],
    ['output',                \$output],
    ['base-sentences',        \$base_sentences],
    ['base-paragraphs',       \$base_paragraphs],
    ['base-pagebreaks',       \$base_pagebreaks],
    ['to-tar',                \$to_tar],
    ['log',                   \$log_level]
  );

  foreach my $opt (@scalar_opts) {
    my ($key, $target) = @$opt;
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # List options are given as one string separated by semicolons.
  # They are only taken from the configuration if the list is still empty.
  my @list_opts = (
    ['skip',  \@skip],
    ['sigle', \@sigle],
    ['anno',  \@anno]
  );

  foreach my $opt (@list_opts) {
    my ($key, $target) = @$opt;
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
308
Akron63f20d42017-04-10 23:40:29 +0200309
# Set defaults for all options that are still undefined
for (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR'],
  [\$base_sentences,        ''],
  [\$base_paragraphs,       ''],
  [\$base_pagebreaks,       '']
) {
  ${ $_->[0] } //= $_->[1];
};

# The base layer options are compared in lower case further down
for (\$base_sentences, \$base_paragraphs, \$base_pagebreaks) {
  $$_ = lc $$_;
};
Akron3741f8b2016-12-21 19:55:21 +0100326
Akron63f20d42017-04-10 23:40:29 +0200327
# Initialize log4perl object - everything from the chosen
# level upwards is written by the STDERR appender
Log::Log4perl->init({
  'log4perl.rootLogger'
    => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file passed on the command line
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
340
341
# Arguments for pod2usage in case of invalid invocations
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000355
Akronc11f7982017-02-21 21:20:14 +0100356
# A job count of -1 requests five jobs per reported CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
362
Akron821db3d2017-04-06 21:19:31 +0200363
# Start serial processing: every input is converted by a separate run of
# this script with the 'archive' command, one after the other. Each run
# writes to its own target below the output, named after the input.
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments handed down to
  # the child runs, as each child gets its own -i/-o pair below.
  # Both spellings are removed: the flag followed by a separate value
  # ("-i value") and the flag joined with its value ("--input=value").
  # Without removing the joined form, the original input and output
  # would be passed to every child in addition to its own pair.
  my $io_flag = qr/^--?(?:input|i|output|o)(=.*)?$/s;
  my $remove_next = 0;
  my @child_argv;

  foreach my $arg (@keep_argv) {

    # Input or output flag
    if ($arg =~ $io_flag) {

      # A joined value is already gone with the flag,
      # a separate value is the next argument
      $remove_next = defined $1 ? 0 : 1;
      next;
    };

    # Input or output value
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @child_argv, $arg;
  };

  @keep_argv = @child_argv;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving - a failing run is reported, but
    # the remaining inputs are still processed
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $in failed" .
          ($? == -1 ? ": $!" : ' with exit code ' . ($? >> 8))
      );
    };
  };

  exit;
};
418
# Everything to skip, in lower case
my %skip = map { (lc($_) => 1) } @skip;

# DeReKo structure may serve as the base for
# sentences, paragraphs and pagebreaks
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotation layers as [Foundry, Layer(, Parameter)]
my @layers = (

  # Base - only without an explicit base layer
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo
  (
    @dereko_attr ?
      ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
      ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly
# requested annotations (Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
520
# Get tokenization basis in the form Foundry#Layer.
# The variables are declared unconditionally, as the behaviour of
# a 'my' declaration with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is missing,
# if the token base has no '#' separated second part
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that is used to process the texts
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
547
# Get file name based on path information
#
# Takes the path of a text and returns a flat name, in which the path
# separators are replaced by dashes. Temporary directory fragments and
# the base of the first input are removed from the path before.
# If the name consists of at least four dash separated parts
# afterwards, the first part is dropped as well.
sub get_file_name ($) {
  my $file = shift;

  # Base is the first input - for a directory without
  # trailing slash, the directory name itself is not part of the base
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base. The base is a file system path and no pattern,
  # so it is quoted - otherwise characters like '.', '+' or '(' in the
  # path would match wrongly or make the expression fail to compile.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
564
Akron63f20d42017-04-10 23:40:29 +0200565
# Turn an input path or glob pattern into a flat name, that can be
# used as the name of an output directory or file.
#
# Path separators and brackets become dashes, wildcards are removed,
# and a trailing '.zip' is dropped. The result neither starts nor
# ends with a dash.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;       # Transform paths
  $glob =~ s/[\*\?]//g;        # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;        # Remove sequences of binding characters

  # Remove file extension - this has to happen before the end is
  # cleaned, as 'name-*.zip' would result in 'name-' otherwise
  $glob =~ s/\.zip$//;

  $glob =~ s/^-//;             # Clean beginning
  $glob =~ s/-$//;             # Clean end
  return $glob;
};
577
578
# Convert sigle to path construct (A_B.C becomes A/B/C)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# With a command given, an output that is not
# meant to be tarred has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
588
Akron63f20d42017-04-10 23:40:29 +0200589
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
609
610
# Process a single file (no command was given)
if (!$cmd) {
  my $input = $input[0];

  # Take the start times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_last . ' (overall: ' . $overall . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
642
Nils Diewald59094f22014-11-05 18:20:50 +0000643
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) if !$output;

  # Create new archive object based on the first input,
  # which has to be a plain file
  my $archive;
  if (!(-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0])))) {

    # Can't create archive object
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  if (!$archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach (@input[1..$#input]) {
    $archive->attach($_);
  };

  my $prefix = 1;

  # Check if a prefix is needed - the archive is only asked once
  my $prefix_check = 0;
  my $check_prefix = sub {
    return if $prefix_check;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_check = 1;
    return;
  };

  # Sigles are given - extract the doc sigles
  # immediately and keep the text sigles for later
  if (@sigle) {
    my @new_sigle;

    # Iterate over all sigle
    foreach (@sigle) {

      # Sigle is a text sigle
      unless ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
        push @new_sigle, $_;
        $check_prefix->();
        next;
      };

      # Sigle is a doc sigle
      print "$_ ...";
      $check_prefix->();
      print "\n";

      # TODO: Make this OS independent
      my $path = ($prefix ? './' : '') . $_;

      my $extracted = $archive->extract_doc(
        $path, $output, $sequential_extraction ? 1 : $jobs
      );

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    @sigle = @new_sigle;
  }

  # No sigles given - extract all texts of the archive
  else {

    # Get files
    foreach ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  };

  # Iterate over all given sigles and extract
  foreach (@sigle) {

    print "$_ ...\n";

    # TODO: Make this OS independent
    my $extracted = $archive->extract_text(
      ($prefix ? './' : '') . $_, $output
    );

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
751
Akron81500102017-04-07 20:45:44 +0200752
Akron941c1a62016-02-23 17:41:41 +0100753# Process an archive
754elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000755
Akron81500102017-04-07 20:45:44 +0200756 my $archive_output;
757
758 # First extract, then archive
759 if (defined $extract_dir) {
760
761 # Create new archive object
762 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
763
764 # Check zip capabilities
765 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200766 $log->error("Unzip is not installed or incompatible.");
767 exit 1;
Akron81500102017-04-07 20:45:44 +0200768 };
769
770 # Add further annotation archived
771 $archive->attach($_) foreach @input[1..$#input];
772
773 # Create a temporary directory
774 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200775 $extract_dir = tempdir(CLEANUP => 0);
776 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200777 };
778
Akron63f20d42017-04-10 23:40:29 +0200779 # Add some random extra to avoid clashes with multiple archives
780 $extract_dir = catdir($extract_dir, random_string('cccccc'));
781
782 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200783 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200784 @input = ($extract_dir);
785 }
786 else {
787 $log->error('Unable to extract from primary archive ' . $input[0] .
788 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200789 exit 1;
Akron81500102017-04-07 20:45:44 +0200790 };
791 }
792
793 # Can't create archive object
794 else {
795 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200796 exit 1;
Akron81500102017-04-07 20:45:44 +0200797 };
798 };
799
Akrone1dbc382016-07-08 22:24:52 +0200800 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100801
Akron7d4cdd82016-08-17 21:39:45 +0200802 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100803 my $pool = Parallel::ForkManager->new($jobs);
804
Akron7d4cdd82016-08-17 21:39:45 +0200805 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100806 my $iter = 1; # Current text in process
807
Akronda3097e2017-04-23 19:53:57 +0200808 my $tar_archive;
809 my $output_dir = $output;
810 my $tar_fh;
811
812 # Initialize tar archive
813 if ($to_tar) {
814 $tar_archive = Archive::Tar::Builder->new(
815 ignore_errors => 1
816 );
817
818 # Set output name
819 my $tar_file = $output;
820 unless ($tar_file =~ /\.tar$/) {
821 $tar_file .= '.tar';
822 };
823
824 # Initiate the tar file
825 print "Writing to file $tar_file\n";
826 $tar_fh = IO::File->new($tar_file, 'w');
827 $tar_fh->binmode(1);
828
829 # Set handle
830 $tar_archive->set_handle($tar_fh);
831
832 # Output to temporary directory
833 $output_dir = File::Temp->newdir;
834 };
835
Akron941c1a62016-02-23 17:41:41 +0100836 # Report on fork message
$pool->run_on_finish(
  sub {
    # The callback receives the process id and the exit code first;
    # the payload passed to $pool->finish() is the last argument.
    my ($pid, $code) = @_;
    my $data = pop;

    # A process that ended without handing over a payload delivers no
    # array reference - normalize to an empty payload so the report
    # below neither warns nor dies under strict refs.
    $data = [] unless ref($data) eq 'ARRAY';

    # Payload layout: [ status message, temporary directory, output file ]
    my $message = defined $data->[0] ? $data->[0] : 'No status message received';

    # Report progress as "Convert [$pid:current/total] code message"
    print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
      ($iter++) . "/$count]" .
      ($code ? " $code" : '') .
      ' ' . $message . "\n";

    # On success move the written file into the tar archive
    if (!$code && $to_tar && $data->[2]) {
      my $filename = $data->[2];

      # Lock filehandle
      if (flock($tar_fh, LOCK_EX)) {

        # Store the file under its base name only
        my $clean_file = fileparse($filename);

        # Archive and remove file
        $tar_archive->archive_as($filename => $clean_file);
        unlink $filename;

        # Unlock filehandle
        flock($tar_fh, LOCK_UN);
      }
      else {
        $log->warn("Unable to add $filename to archive");
      };
    };

    # Drop the reference to the temporary directory (second payload element)
    $data->[1] = undef if $data->[1];
  }
);
870
871 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200872 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100873 print "Reading data ...\n";
874
Akron7d4cdd82016-08-17 21:39:45 +0200875 # unless (Cache::FastMmap->new(
876 # share_file => $cache_file,
877 # cache_size => $cache_size,
878 # init_file => $cache_init
879 # )) {
880 # print "Unable to intialize cache '$cache_file'\n\n";
881 # exit(1);
882 # };
Akron11c80302016-03-18 19:44:43 +0100883
Akron486f9ab2017-04-22 23:25:19 +0200884
Akron941c1a62016-02-23 17:41:41 +0100885 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100886 if (-d $input[0]) {
887 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100888 my @dirs;
889 my $dir;
890
Akron7d4cdd82016-08-17 21:39:45 +0200891 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100892 while (1) {
893 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200894 push @dirs, $dir;
895 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100896 };
897 last unless $it->next;
898 };
899
900 print "Start processing ...\n";
901 $t = Benchmark->new;
902 $count = scalar @dirs;
903
904 DIRECTORY_LOOP:
905 for (my $i = 0; $i < $count; $i++) {
906
Akrone1dbc382016-07-08 22:24:52 +0200907 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200908 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200909 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200910 );
Akron941c1a62016-02-23 17:41:41 +0100911
912 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200913 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200914
Akron13d56622016-10-31 14:54:49 +0100915 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200916 $pool->finish(
917 0,
Akronda3097e2017-04-23 19:53:57 +0200918 [
919 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
920 undef,
921 $filename
922 ]
Akron486f9ab2017-04-22 23:25:19 +0200923 );
Akron3ec48972016-08-17 23:24:52 +0200924 }
925 else {
Akron4c0cf312016-10-15 16:42:09 +0200926 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200927 };
Akron941c1a62016-02-23 17:41:41 +0100928 };
929 }
930
931 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200932 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200933
Akron941c1a62016-02-23 17:41:41 +0100934 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200935 $log->error("Unzip is not installed or incompatible.");
936 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100937 };
938
Akron08385f62016-03-22 20:37:04 +0100939 # Add further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200940 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100941
Akron941c1a62016-02-23 17:41:41 +0100942 print "Start processing ...\n";
943 $t = Benchmark->new;
944 my @dirs = $archive->list_texts;
945 $count = scalar @dirs;
946
947 ARCHIVE_LOOP:
948 for (my $i = 0; $i < $count; $i++) {
949
950 # Split path information
951 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
952
Akrone1dbc382016-07-08 22:24:52 +0200953 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200954 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200955 get_file_name(
956 catfile($corpus, $doc, $text)
957 . '.json' . ($gzip ? '.gz' : '')
958 )
Akrone1dbc382016-07-08 22:24:52 +0200959 );
Akron941c1a62016-02-23 17:41:41 +0100960
961 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200962 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100963
Akron4c0cf312016-10-15 16:42:09 +0200964 # Create temporary file
965 $temp = File::Temp->newdir;
966
Akronbdf434a2016-10-24 17:42:07 +0200967 # TODO: Check if $filename exist at the beginning,
968 # because extraction can be horrible slow!
969
Akron941c1a62016-02-23 17:41:41 +0100970 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200971 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100972
Akron7d4cdd82016-08-17 21:39:45 +0200973 # Create corpus directory
974 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100975
Akron7d4cdd82016-08-17 21:39:45 +0200976 # Temporary directory
977 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100978
Akron7d4cdd82016-08-17 21:39:45 +0200979 # Write file
Akron13d56622016-10-31 14:54:49 +0100980 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200981
Akron4c0cf312016-10-15 16:42:09 +0200982 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100983 $pool->finish(
984 0,
Akronda3097e2017-04-23 19:53:57 +0200985 [
986 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
987 $temp,
988 $filename
989 ]
Akron13d56622016-10-31 14:54:49 +0100990 );
991 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200992 }
993 else {
Akron4c0cf312016-10-15 16:42:09 +0200994 # Delete temporary file
995 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200996 };
Akron941c1a62016-02-23 17:41:41 +0100997 }
Akron7d4cdd82016-08-17 21:39:45 +0200998
999 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001000 else {
Akron4c0cf312016-10-15 16:42:09 +02001001 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001002 };
1003 };
1004 }
1005
# Input is neither a directory nor an archive
else {
  # Nothing was converted and the benchmark timer ($t) was never started,
  # so the timing summary and the "Done." message below would be bogus.
  # Fail with a non-zero exit status like the other fatal input errors
  # of this command (unreadable primary archive, unusable unzip).
  $log->error('Input is neither a directory nor an archive.');

  # Keep the cache cleanup the fall-through path would have performed
  unlink($cache_file) if $cache_delete;

  exit 1;
};
1009
1010 $pool->wait_all_children;
1011
Akron11c80302016-03-18 19:44:43 +01001012 # Delete cache file
1013 unlink($cache_file) if $cache_delete;
1014
Akronda3097e2017-04-23 19:53:57 +02001015 # Close tar filehandle
1016 if ($to_tar && $tar_fh) {
1017 $tar_archive->finish;
1018 $tar_fh->close;
1019 print "Wrote to tar archive.\n";
1020 };
1021
Akron63f20d42017-04-10 23:40:29 +02001022 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001023 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001024};
Akron941c1a62016-02-23 17:41:41 +01001025
Nils Diewald2db9ad02013-10-29 19:26:43 +00001026
# Remove the temporary extraction directory (if one was used)
# and report how many file system objects were deleted with it.
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};


# Finish the output with an empty line
print "\n";
1035
Nils Diewald2db9ad02013-10-29 19:26:43 +00001036__END__
Akron941c1a62016-02-23 17:41:41 +01001037
1038=pod
1039
1040=encoding utf8
1041
1042=head1 NAME
1043
Akronf7ad89e2016-03-16 18:22:47 +01001044korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001045
1046
1047=head1 SYNOPSIS
1048
Akrona76d8352016-10-27 16:27:32 +02001049 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001050
Akron2fd402b2016-10-27 21:26:48 +02001051
Akron941c1a62016-02-23 17:41:41 +01001052=head1 DESCRIPTION
1053
1054L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1055compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001056The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001057
1058
1059=head1 INSTALLATION
1060
1061The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1062
Akronaf386982016-10-12 00:33:25 +02001063 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001064
Akronc13a1702016-03-15 19:33:14 +01001065In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001066be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001067Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +02001068To work with zip archives, the C<unzip> tool needs to be present as well.
Akron941c1a62016-02-23 17:41:41 +01001069
1070=head1 ARGUMENTS
1071
Akrona76d8352016-10-27 16:27:32 +02001072 $ korapxml2krill -z --input <directory> --output <filename>
1073
1074Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001075It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001076
Akron941c1a62016-02-23 17:41:41 +01001077=over 2
1078
1079=item B<archive>
1080
Akron081639e2017-04-21 19:01:39 +02001081 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001082
Akron2fd402b2016-10-27 21:26:48 +02001083Converts an archive of KorAP-XML documents. It expects a directory
1084(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001085
1086=item B<extract>
1087
Akrona76d8352016-10-27 16:27:32 +02001088 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1089
1090Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001091
Akron63f20d42017-04-10 23:40:29 +02001092=item B<serial>
1093
1094 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1095
1096Convert archives sequentially. The inputs are not merged but treated
1097as they are (so they may be premerged or globs).
1098The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001099are created based on the archive name. In case the C<--to-tar> flag is given,
1100the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001101
1102
Akron941c1a62016-02-23 17:41:41 +01001103=back
1104
1105
1106=head1 OPTIONS
1107
1108=over 2
1109
Akrona76d8352016-10-27 16:27:32 +02001110=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001111
Akrona76d8352016-10-27 16:27:32 +02001112Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001113
Akron7606afa2016-10-25 16:23:49 +02001114Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001115document, while C<archive> expects a KorAP-XML corpus folder or a zip
1116file to batch process multiple files.
1117C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001118
Akrona76d8352016-10-27 16:27:32 +02001119C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001120that the first archive listed contains all primary data files
1121and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001122
Akron7606afa2016-10-25 16:23:49 +02001123 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001124
Akron821db3d2017-04-06 21:19:31 +02001125Input may also be defined using BSD glob wildcards.
1126
1127 -i 'file/news*.zip'
1128
1129The extended input array will be sorted in length order, so the shortest
1130path needs to contain all primary data files and all meta data files.
1131
Akron0c3e3752016-06-28 15:55:53 +02001132(The directory structure follows the base directory format,
1133that may include a C<.> root folder.
1134In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001135need to be passed with a hash sign in front of the archive's name.
1136This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001137
Akron7606afa2016-10-25 16:23:49 +02001138To support zip files, a version of C<unzip> needs to be installed that is
1139compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001140
Akron7606afa2016-10-25 16:23:49 +02001141B<The root folder switch using the hash sign is experimental and
1142may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001143
Akron63f20d42017-04-10 23:40:29 +02001144=item B<--input-base|-ib> <directory>
1145
1146The base directory for inputs.
1147
1148
Akron941c1a62016-02-23 17:41:41 +01001149=item B<--output|-o> <directory|file>
1150
1151Output folder for archive processing or
1152document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001153writes to C<STDOUT> by default
1154(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001155
1156=item B<--overwrite|-w>
1157
1158Overwrite files that already exist.
1159
Akron3741f8b2016-12-21 19:55:21 +01001160=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001161
1162Define the default tokenization by specifying
1163the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001164of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001165
Akron3741f8b2016-12-21 19:55:21 +01001166
1167=item B<--base-sentences|-bs> <foundry>#<layer>
1168
1169Define the layer for base sentences.
1170If given, this will be used instead of using C<Base#Sentences>.
1171Currently C<DeReKo#Structure> is the only additional layer supported.
1172
1173 Defaults to unset.
1174
1175
1176=item B<--base-paragraphs|-bp> <foundry>#<layer>
1177
1178Define the layer for base paragraphs.
1179If given, this will be used instead of using C<Base#Paragraphs>.
1180Currently C<DeReKo#Structure> is the only additional layer supported.
1181
1182 Defaults to unset.
1183
1184
Akron41ac10b2017-02-08 22:47:25 +01001185=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1186
1187Define the layer for base pagebreaks.
1188Currently C<DeReKo#Structure> is the only layer supported.
1189
1190 Defaults to unset.
1191
1192
Akron941c1a62016-02-23 17:41:41 +01001193=item B<--skip|-s> <foundry>[#<layer>]
1194
Akronf7ad89e2016-03-16 18:22:47 +01001195Skip specific annotations by specifying the foundry
1196(and optionally the layer with a C<#>-prefix),
1197e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001198Can be set multiple times.
1199
Akronc13a1702016-03-15 19:33:14 +01001200=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001201
Akronf7ad89e2016-03-16 18:22:47 +01001202Convert specific annotations by specifying the foundry
1203(and optionally the layer with a C<#>-prefix),
1204e.g. C<Mate> or C<Mate#Morpho>.
1205Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001206
1207=item B<--primary|-p>
1208
Akronc13a1702016-03-15 19:33:14 +01001209Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001210Can be flagged using C<--no-primary> as well.
1211This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001212
1213=item B<--jobs|-j>
1214
1215 Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001216for archive processing.
Akron11c80302016-03-18 19:44:43 +01001217Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001218
1219If C<sequential-extraction> is not set to false, this will
1220also apply to extraction.
1221
Akronc11f7982017-02-21 21:20:14 +01001222Pass -1, and the value will be set automatically to 5
1223times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001224This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001225
Akron9ec88872017-04-12 16:29:06 +02001226=item B<--sequential-extraction|-se>
1227
1228Flag to indicate, if the C<jobs> value also applies to extraction.
1229Some systems may have problems with extracting multiple archives
1230to the same folder at the same time.
1231Can be flagged using C<--no-sequential-extraction> as well.
1232Defaults to C<false>.
1233
Akron35db6e32016-03-17 22:42:22 +01001234=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001235
Akron35db6e32016-03-17 22:42:22 +01001236Define the metadata parser to use. Defaults to C<I5>.
1237Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1238This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001239
1240=item B<--pretty|-y>
1241
Akronc13a1702016-03-15 19:33:14 +01001242Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001243This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001244
1245=item B<--gzip|-z>
1246
Akronf7ad89e2016-03-16 18:22:47 +01001247Compress the output.
1248Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001249
Akron11c80302016-03-18 19:44:43 +01001250=item B<--cache|-c>
1251
1252File to mmap a cache (using L<Cache::FastMmap>).
1253Defaults to C<korapxml2krill.cache> in the calling directory.
1254
1255=item B<--cache-size|-cs>
1256
1257Size of the cache. Defaults to C<50m>.
1258
1259=item B<--cache-init|-ci>
1260
1261Initialize cache file.
1262Can be flagged using C<--no-cache-init> as well.
1263Defaults to C<true>.
1264
1265=item B<--cache-delete|-cd>
1266
1267Delete cache file after processing.
1268Can be flagged using C<--no-cache-delete> as well.
1269Defaults to C<true>.
1270
Akron636aa112017-04-07 18:48:56 +02001271=item B<--config|-cfg>
1272
1273Configure the parameters of your call in a file
1274of key-value pairs with whitespace separator
1275
1276 overwrite 1
1277 token DeReKo#Structure
1278 ...
1279
1280Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001281C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001282C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001283C<output>,
1284C<temp-extract>, C<sequential-extraction>,
1285C<base-sentences>, C<base-paragraphs>,
1286C<base-pagebreaks>,
1287C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001288(semicolon separated), C<anno> (semicolon separated).
1289
Akron81500102017-04-07 20:45:44 +02001290=item B<--temporary-extract|-te>
1291
1292Only valid for the C<archive> command.
1293
1294This will first extract all files into a
1295directory and then will archive.
1296If the directory is given as C<:temp:>,
1297a temporary directory is used.
1298This is especially useful to avoid
1299massive unzipping and potential
1300network latency.
Akron636aa112017-04-07 18:48:56 +02001301
Akrone10ad322016-02-27 10:54:26 +01001302=item B<--sigle|-sg>
1303
Akron20807582016-10-26 17:11:34 +02001304Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001305Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001306I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001307Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001308In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001309On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001310
Akron941c1a62016-02-23 17:41:41 +01001311=item B<--log|-l>
1312
1313The L<Log4perl> log level, defaults to C<ERROR>.
1314
1315=item B<--help|-h>
1316
1317Print this document.
1318
1319=item B<--version|-v>
1320
1321Print version information.
1322
1323=back
1324
Akronc13a1702016-03-15 19:33:14 +01001325=head1 ANNOTATION SUPPORT
1326
1327L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1328developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1329The base foundry with paragraphs, sentences, and the text element is mandatory for
1330L<Krill|https://github.com/KorAP/Krill>.
1331
Akron821db3d2017-04-06 21:19:31 +02001332 Base
1333 #Paragraphs
1334 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001335
Akron821db3d2017-04-06 21:19:31 +02001336 Connexor
1337 #Morpho
1338 #Phrase
1339 #Sentences
1340 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001341
Akron821db3d2017-04-06 21:19:31 +02001342 CoreNLP
1343 #Constituency
1344 #Morpho
1345 #NamedEntities
1346 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001347
Akronce125b62017-06-19 11:54:36 +02001348 CMC
1349 #Morpho
1350
Akron821db3d2017-04-06 21:19:31 +02001351 DeReKo
1352 #Structure
Akronc13a1702016-03-15 19:33:14 +01001353
Akron821db3d2017-04-06 21:19:31 +02001354 DRuKoLa
1355 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001356
Akron821db3d2017-04-06 21:19:31 +02001357 Glemm
1358 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001359
Akron4c679192018-01-16 17:41:49 +01001360 LWC
1361 #Dependency
1362
Akron821db3d2017-04-06 21:19:31 +02001363 Malt
1364 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001365
Akron821db3d2017-04-06 21:19:31 +02001366 MarMoT
1367 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001368
Akron821db3d2017-04-06 21:19:31 +02001369 Mate
1370 #Dependency
1371 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001372
Akron821db3d2017-04-06 21:19:31 +02001373 MDParser
1374 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001375
Akron821db3d2017-04-06 21:19:31 +02001376 OpenNLP
1377 #Morpho
1378 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001379
Akron821db3d2017-04-06 21:19:31 +02001380 Sgbr
1381 #Lemma
1382 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001383
Akron821db3d2017-04-06 21:19:31 +02001384 TreeTagger
1385 #Morpho
1386 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001387
Akron821db3d2017-04-06 21:19:31 +02001388 XIP
1389 #Constituency
1390 #Morpho
1391 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001392
Akronc13a1702016-03-15 19:33:14 +01001393
1394More importers are in preparation.
1395New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1396See the built-in annotation importers as examples.
1397
Akron941c1a62016-02-23 17:41:41 +01001398=head1 AVAILABILITY
1399
1400 https://github.com/KorAP/KorAP-XML-Krill
1401
1402
1403=head1 COPYRIGHT AND LICENSE
1404
Akron4c679192018-01-16 17:41:49 +01001405Copyright (C) 2015-2018, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001406
Akron941c1a62016-02-23 17:41:41 +01001407Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001408
Akrona76d8352016-10-27 16:27:32 +02001409Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001410
1411L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1412Corpus Analysis Platform at the
1413L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1414member of the
1415L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1416
1417This program is free software published under the
1418L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1419
1420=cut