blob: bcdadc8f705725340f1819c4dcb296a7dec6e4b2 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron941c1a62016-02-23 17:41:41 +0100129# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100130
# Date of the most recent entry in the CHANGES list above
our $LAST_CHANGE = '2018/01/16';

# Directory of this script, as reported by FindBin
our $LOCAL = $FindBin::Bin;

# Banner passed as message to pod2usage for --help and --version
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
136
# Prototypes
# Both helpers are defined further down and take exactly one scalar
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command
# The first argument is treated as the command, unless it starts
# with a dash and is therefore an option
my $cmd;
our @ARGV;
my $has_command = $ARGV[0] && index($ARGV[0], '-') != 0;
$cmd = shift @ARGV if $has_command;

# Remember the remaining arguments, so serial mode can pass them on
my @keep_argv = @ARGV;

# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options that take a single value or act as a switch
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'gzip|z'                   => \$gzip,
  'to-tar'                   => \$to_tar,
  'temporary-extract|te=s'   => \$extract_dir,

  # Conversion
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,

  # Runtime behaviour
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'jobs|j=i'                 => \$jobs,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache|c=s'                => \$cache_file,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Print the selected POD sections together with the version banner
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
195
Akron63f20d42017-04-10 23:40:29 +0200196
# Load from configuration
# A value from the configuration file is only used, if the
# corresponding option was not given on the command line
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value options: configuration key and target variable
  my @scalar_targets = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $target (@scalar_targets) {
    my ($key, $var_ref) = @$target;

    # The command line wins
    next if defined $$var_ref;

    # Nothing configured
    next unless defined $config{$key};

    $$var_ref = $config{$key};
  };

  # Multi-value options: the configuration value is split at semicolons
  my @list_targets = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $target (@list_targets) {
    my ($key, $list_ref) = @$target;

    # The command line wins
    next if scalar(@$list_ref);

    # Nothing configured
    next unless defined $config{$key};

    @$list_ref = split /\s*;\s*/, $config{$key};
  };
};
308
Akron63f20d42017-04-10 23:40:29 +0200309
# Set default token base
# ... and all other defaults for values that are still undefined
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;

# The base annotations default to the empty string
# and are normalized to lower case
foreach ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $_ = lc(defined $_ ? $_ : '');
};
Akron3741f8b2016-12-21 19:55:21 +0100326
Akron63f20d42017-04-10 23:40:29 +0200327
# Initialize log4perl object
# Messages of the configured level are written colored to the screen
Log::Log4perl->init({
  'log4perl.rootLogger'
    => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};


# Arguments for pod2usage in case of invalid parameters:
# print the usage information and exit with code 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means: use five jobs per CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
362
Akron821db3d2017-04-06 21:19:31 +0200363
# Start serial processing
# In serial mode each input is converted on its own, by calling this
# script again with the 'archive' command. The script exits with code 1,
# if at least one of these calls failed.
if ($cmd && $cmd eq 'serial') {

  # Without tar output, the output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Input and output flags with one or two leading dashes
  my $io_flag = qr/--?(?:input|i|output|o)/;

  # Remove all inputs and outputs from the arguments passed on,
  # as both are set separately for every call below
  my $remove_next = 0;
  @keep_argv = @{c(@keep_argv)->grep(
    sub {
      # Flag, followed by its value in the next argument
      if ($_ =~ /^$io_flag\z/) {
        $remove_next = 1;
        return 0;
      }

      # Value of the preceding flag
      elsif ($remove_next) {
        $remove_next = 0;
        return 0;
      }

      # Flag with attached value, e.g. --input=archive.zip
      elsif ($_ =~ /^$io_flag=/) {
        return 0;
      };

      # Pass parameter
      return 1;
    }
  )->to_array};


  # Number of inputs that could not be processed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $input_glob (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($input_glob));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $input_glob, '-o', $new_out
    );
    print "Start serial processing of $input_glob to $new_out\n";

    # Start archiving - the list form does not invoke a shell
    next if system(@archive_cmd) == 0;

    # Report the failure, but go on with the remaining inputs
    my $reason =
        $? == -1   ? "could not be started: $!"
      : ($? & 127) ? 'died with signal ' . ($? & 127)
      :              'exited with code ' . ($? >> 8);
    $log->error("Serial processing of $input_glob $reason");
    $failed++;
  };

  exit($failed ? 1 : 0);
};
418
# Lower-cased names of everything that should be skipped
my %skip = map { ( lc($_), 1 ) } @skip;

# All supported annotations as [Foundry, Layer] pairs.
# The order of the list is the order the annotations are passed on.
my @layers;
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

# Foundries listed before DeReKo
foreach my $group (
  [ 'Connexor' => [qw/Morpho Syntax Phrase Sentences/] ],
  [ 'CoreNLP'  => [qw/NamedEntities Sentences Morpho Constituency/] ],
  [ 'CMC'      => [qw/Morpho/] ]
) {
  my ($foundry, $names) = @$group;
  push @layers, map { [$foundry, $_] } @$names;
};

# DeReKo
# Collect the base annotations that are taken from DeReKo#Structure
my @dereko_attr = ();
foreach my $base (
  [ $base_sentences  => 'sentences'  ],
  [ $base_paragraphs => 'paragraphs' ],
  [ $base_pagebreaks => 'pagebreaks' ]
) {
  push @dereko_attr, $base->[1] if $base->[0] eq 'dereko#structure';
};

# The collected bases are passed as a third element
push(
  @layers,
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# Foundries listed after DeReKo
foreach my $group (
  [ 'Glemm'      => [qw/Morpho/] ],
  [ 'LWC'        => [qw/Dependency/] ],
  [ 'Malt'       => [qw/Dependency/] ],
  [ 'MDParser'   => [qw/Dependency/] ],
  [ 'Mate'       => [qw/Morpho Dependency/] ],
  [ 'OpenNLP'    => [qw/Morpho Sentences/] ],
  [ 'Sgbr'       => [qw/Lemma Morpho/] ],    # Schreibgebrauch
  [ 'TreeTagger' => [qw/Morpho Sentences/] ],
  [ 'XIP'        => [qw/Morpho Constituency Sentences Dependency/] ],
  [ 'DRuKoLa'    => [qw/Morpho/] ],
  [ 'MarMoT'     => [qw/Morpho/] ]
) {
  my ($foundry, $names) = @$group;
  push @layers, map { [$foundry, $_] } @$names;
};


# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly given annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Drop if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])})
  } @layers;
};
520
# Get tokenization basis
# The token base is given in the form 'Foundry#Layer'.
# The variables are declared unconditionally, as a "my" with a
# statement modifier has undefined behaviour.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension
# (the layer is undefined, if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# TODO: This should not be initialized for batch
# Cache passed to the batch object below
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object
# It is used to process the inputs and receives the tokenization
# basis, the output settings and the filtered annotations
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
# Get file name based on path information
#
# Expects the path of a text and returns a name, in which the path
# segments are joined by dashes. The path of the first input and
# temporary directory fragments are removed from the name and at most
# the last three segments are kept.
sub get_file_name ($) {
  my $file = shift;

  # Base path to remove: the first input - in case of a directory
  # without the last path segment
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The path is quoted, so characters like '.',
  # '+' or '\' are matched literally instead of being read as regex
  $file =~ s/\/?\Q$i\E//;

  # Join the path segments by dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Keep only the last three segments
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
564
Akron63f20d42017-04-10 23:40:29 +0200565
# Turn an input path or wildcard pattern into a name that can be
# used as a directory name in serial processing.
#
# Expects the pattern and returns the name as a string.
sub get_file_name_from_glob ($) {
  my $name = shift;

  # Apply all substitutions to the name
  for ($name) {
    s![\\\/]!-!g;        # Path separators become binding characters
    s/[\*\?]//g;         # Arbitrary fills are removed
    s/[\{\}\[\]]/-/g;    # Class and alternative brackets become binding characters
    s/\-\-+/-/g;         # Sequences of binding characters are collapsed
    s/^-//;              # No binding character at the beginning
    s/-$//;              # No binding character at the end
    s/\.zip$//;          # No file extension
  };

  return $name;
};
577
578
# Convert sigle to path construct
# (e.g. 'A_B.C' becomes 'A/B/C')
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands not writing to a tar file require an existing output directory
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
588
Akron63f20d42017-04-10 23:40:29 +0200589
# Glob and prefix files
# Each input may be a wildcard pattern and is optionally prefixed
# with the input base
if (@input) {

  # Prefix with input root and expand
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Patterns without any match expand to nothing - stop here, as
  # everything below relies on the first input being defined
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
609
610
# Process a single file
# This is the default, if no command is given
unless ($cmd) {

  # Take the start time at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call and since the start
  sub stop_time {
    my $now        = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last . ' (overall: ' . $overall . ')'
    );

    $main::LAST_STOP = $now;
  };

  # Create and parse new document
  # The path of the first input needs to end with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
642
Nils Diewald59094f22014-11-05 18:20:50 +0000643
Akrone10ad322016-02-27 10:54:26 +0100644# Extract XML files
Akron81500102017-04-07 20:45:44 +0200645if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100646
Akrond5643ad2017-07-04 20:27:13 +0200647 # Output is required
648 pod2usage(%ERROR_HASH) unless $output;
649
Akron7d4cdd82016-08-17 21:39:45 +0200650 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200651 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100652
Akron7d4cdd82016-08-17 21:39:45 +0200653 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100654 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200655 $log->error("Unzip is not installed or incompatible.");
656 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100657 };
658
Akronb0c88db2016-06-29 16:33:18 +0200659 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200660 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200661
Akron651cb8d2016-08-16 21:44:49 +0200662 my $prefix = 1;
663
Akron03b24db2016-08-16 20:54:32 +0200664 # No sigles given
665 unless (@sigle) {
666
667 # Get files
668 foreach ($archive->list_texts) {
669
670 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200671 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200672
673 # TODO: Make this OS independent
674 push @sigle, join '/', $corpus, $doc, $text;
675 };
Akron20807582016-10-26 17:11:34 +0200676 }
677
678 # Check sigle for doc sigles
679 else {
680 my @new_sigle;
681
682 my $prefix_check = 0;
683
684 # Iterate over all sigle
685 foreach (@sigle) {
686
687 # Sigle is a doc sigle
688 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200689
Akron60a8caa2017-02-17 21:51:27 +0100690 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200691 # Check if a prefix is needed
692 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100693
694 if ($prefix = $archive->check_prefix) {
695 print " with prefix ...";
696 };
Akron20807582016-10-26 17:11:34 +0200697 $prefix_check = 1;
698 };
699
Akron60a8caa2017-02-17 21:51:27 +0100700 print "\n";
701
Akron20807582016-10-26 17:11:34 +0200702 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200703 my $path = ($prefix ? './' : '') . $_;
704
705 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200706 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200707 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200708 ) ? '' : 'not '
709 );
710 print "extracted.\n";
711 }
Akron60a8caa2017-02-17 21:51:27 +0100712
713 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200714 else {
715 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100716
717 unless ($prefix_check) {
718
719 if ($prefix = $archive->check_prefix) {
720 print " with prefix ...";
721 };
722 $prefix_check = 1;
723 };
Akron20807582016-10-26 17:11:34 +0200724 };
725 };
726 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200727 };
728
Akrone10ad322016-02-27 10:54:26 +0100729 # Iterate over all given sigles and extract
730 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100731
Akron2812ba22016-10-28 21:55:59 +0200732 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200733
Akron03b24db2016-08-16 20:54:32 +0200734 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200735 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100736
Akron20807582016-10-26 17:11:34 +0200737 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200738 ($prefix ? './' : '') . $_, $output
739 ) ? '' : 'not '
740 );
Akrone10ad322016-02-27 10:54:26 +0100741 print "extracted.\n";
742 };
Akronb0c88db2016-06-29 16:33:18 +0200743 }
Akron7d4cdd82016-08-17 21:39:45 +0200744
745 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200746 else {
747 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200748 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100749 };
750}
751
Akron81500102017-04-07 20:45:44 +0200752
Akron941c1a62016-02-23 17:41:41 +0100753# Process an archive
754elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000755
Akron81500102017-04-07 20:45:44 +0200756 my $archive_output;
757
758 # First extract, then archive
759 if (defined $extract_dir) {
760
761 # Create new archive object
762 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
763
764 # Check zip capabilities
765 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200766 $log->error("Unzip is not installed or incompatible.");
767 exit 1;
Akron81500102017-04-07 20:45:44 +0200768 };
769
770 # Add further annotation archived
771 $archive->attach($_) foreach @input[1..$#input];
772
773 # Create a temporary directory
774 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200775 $extract_dir = tempdir(CLEANUP => 0);
776 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200777 };
778
Akron63f20d42017-04-10 23:40:29 +0200779 # Add some random extra to avoid clashes with multiple archives
780 $extract_dir = catdir($extract_dir, random_string('cccccc'));
781
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200783 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200784 @input = ($extract_dir);
785 }
786 else {
787 $log->error('Unable to extract from primary archive ' . $input[0] .
788 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200789 exit 1;
Akron81500102017-04-07 20:45:44 +0200790 };
791 }
792
793 # Can't create archive object
794 else {
795 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200796 exit 1;
Akron81500102017-04-07 20:45:44 +0200797 };
798 };
799
Akrone1dbc382016-07-08 22:24:52 +0200800 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100801
Akron7d4cdd82016-08-17 21:39:45 +0200802 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100803 my $pool = Parallel::ForkManager->new($jobs);
804
Akron7d4cdd82016-08-17 21:39:45 +0200805 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100806 my $iter = 1; # Current text in process
807
Akronda3097e2017-04-23 19:53:57 +0200808 my $tar_archive;
809 my $output_dir = $output;
810 my $tar_fh;
811
812 # Initialize tar archive
813 if ($to_tar) {
814 $tar_archive = Archive::Tar::Builder->new(
815 ignore_errors => 1
816 );
817
818 # Set output name
819 my $tar_file = $output;
820 unless ($tar_file =~ /\.tar$/) {
821 $tar_file .= '.tar';
822 };
823
824 # Initiate the tar file
825 print "Writing to file $tar_file\n";
826 $tar_fh = IO::File->new($tar_file, 'w');
827 $tar_fh->binmode(1);
828
829 # Set handle
830 $tar_archive->set_handle($tar_fh);
831
832 # Output to temporary directory
833 $output_dir = File::Temp->newdir;
834 };
835
# Report on every finished job and, when writing to a tar archive,
# move the converted file into the archive
$pool->run_on_finish(
  sub {
    # Process id and exit code come first, the data structure
    # handed to finish() is the last argument
    my ($pid, $code) = @_[0, 1];
    my $data = $_[-1];

    # The process id is only shown when jobs are forked
    my $worker = $jobs > 0 ? '$' . $pid . ':' : '';
    my $status = $code ? " $code" : '';

    print 'Convert [' . $worker . ($iter++) . "/$count]" .
      $status . ' ' . $data->[0] . "\n";

    # Only successful conversions with a file name are archived
    my $filename = (!$code && $to_tar) ? $data->[2] : undef;

    if ($filename) {

      # Lock filehandle
      if (!flock($tar_fh, LOCK_EX)) {
        $log->warn("Unable to add $filename to archive");
      }
      else {

        # Archive under the file name without directories
        # and remove the file
        $tar_archive->archive_as($filename => scalar fileparse($filename));
        unlink $filename;

        # Unlock filehandle
        flock($tar_fh, LOCK_UN);
      };
    };

    # Release the second entry (the temporary directory, if any)
    if ($data->[1]) {
      $data->[1] = undef;
    };
  }
);
870
# Start time of the processing and the temporary directory
# used for the text currently extracted
my ($t, $temp);

print "Reading data ...\n";

# unless (Cache::FastMmap->new(
#   share_file => $cache_file,
#   cache_size => $cache_size,
#   init_file => $cache_init
# )) {
#   print "Unable to initialize cache '$cache_file'\n\n";
#   exit(1);
# };


# Input is a directory
if (-d $input[0]) {
  my $walker = Directory::Iterator->new($input[0]);
  my @dirs;

  # Collect every directory that contains a data.xml file
  do {
    my $path;
    if (!$walker->is_directory
          && ($path = $walker->get)
          && $path =~ s{/data\.xml$}{}) {
      push @dirs, $path;
      $walker->prune;
    };
  } while ($walker->next);

  print "Start processing ...\n";
  $t = Benchmark->new;
  $count = scalar @dirs;

  DIRECTORY_LOOP:
  foreach my $text_dir (@dirs) {

    my $suffix   = $gzip ? '.json.gz' : '.json';
    my $filename = catfile($output_dir, get_file_name($text_dir) . $suffix);

    # Get the next fork
    next DIRECTORY_LOOP if $pool->start;

    my $return = $batch_file->process($text_dir => $filename);

    # Conversion failed
    unless ($return) {
      $pool->finish(1, ["Unable to process " . $text_dir]);
      next DIRECTORY_LOOP;
    };

    $pool->finish(
      0,
      [
        "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
        undef,
        $filename
      ]
    );
  };
}
930
# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

  if (!$archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Attach all further annotation archives
  foreach my $annotation_archive (@input[1 .. $#input]) {
    $archive->attach($annotation_archive);
  };

  print "Start processing ...\n";
  $t = Benchmark->new;
  my @texts = $archive->list_texts;
  $count = scalar @texts;

  ARCHIVE_LOOP:
  foreach my $text_path (@texts) {

    # Split path information (the leading prefix is not used)
    my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

    my $suffix   = $gzip ? '.json.gz' : '.json';
    my $filename = catfile(
      $output_dir,
      get_file_name(catfile($corpus, $doc, $text) . $suffix)
    );

    # Get the next fork
    next ARCHIVE_LOOP if $pool->start;

    # Create temporary directory
    $temp = File::Temp->newdir;

    # TODO: Check if $filename exists at the beginning,
    #   because extraction can be horribly slow!

    # Unable to extract
    unless ($archive->extract_text($text_path, $temp)) {
      $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      next ARCHIVE_LOOP;
    };

    # Corpus directory inside of the temporary directory
    my $corpus_dir = catdir("$temp", $corpus);

    # Directory of the extracted text
    my $text_dir = catdir($corpus_dir, $doc, $text);

    # Write file
    my $return = $batch_file->process($text_dir => $filename);

    # Conversion failed
    unless ($return) {
      $pool->finish(1, ["Unable to process " . $text_dir, $temp]);
      next ARCHIVE_LOOP;
    };

    # Hand the temporary directory over together with the result
    $pool->finish(
      0,
      [
        "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
        $temp,
        $filename
      ]
    );
  };
}

else {
  print "Input is neither a directory nor an archive.\n\n";
};
1009
# Wait for all jobs, then clean up and report the elapsed time
$pool->wait_all_children;

# Delete cache file
unlink($cache_file) if $cache_delete;

# Close tar filehandle
if ($to_tar && $tar_fh) {
  $tar_archive->finish;

  # Buffered write errors may only surface on close - check the result
  if ($tar_fh->close) {
    print "Wrote to tar archive.\n";
  }
  else {
    $log->error("Unable to close tar archive: $!");
  };
};

# $t is only set once processing has started. Without a valid input
# there is nothing to measure, and timediff() requires two Benchmark
# objects - calling it with undef would abort the script at this point.
print timestr(timediff(Benchmark->new, $t)) . "\n" if $t;
print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001024};
Akron941c1a62016-02-23 17:41:41 +01001025
Nils Diewald2db9ad02013-10-29 19:26:43 +00001026
# Remove the temporary extraction directory, if one was used,
# and report the number of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};


# Finish the output with an empty line
print "\n";
1035
Nils Diewald2db9ad02013-10-29 19:26:43 +00001036__END__
Akron941c1a62016-02-23 17:41:41 +01001037
1038=pod
1039
1040=encoding utf8
1041
1042=head1 NAME
1043
Akronf7ad89e2016-03-16 18:22:47 +01001044korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001045
1046
1047=head1 SYNOPSIS
1048
Akrona76d8352016-10-27 16:27:32 +02001049 korapxml2krill [archive|extract|serial] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001050
Akron2fd402b2016-10-27 21:26:48 +02001051
Akron941c1a62016-02-23 17:41:41 +01001052=head1 DESCRIPTION
1053
1054L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1055compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001056The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001057
1058
1059=head1 INSTALLATION
1060
1061The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1062
Akronaf386982016-10-12 00:33:25 +02001063 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001064
Akronc13a1702016-03-15 19:33:14 +01001065In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001066be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001067Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +02001068In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001069
1070=head1 ARGUMENTS
1071
Akrona76d8352016-10-27 16:27:32 +02001072 $ korapxml2krill -z --input <directory> --output <filename>
1073
1074Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001075It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001076
Akron941c1a62016-02-23 17:41:41 +01001077=over 2
1078
1079=item B<archive>
1080
Akron081639e2017-04-21 19:01:39 +02001081 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001082
Akron2fd402b2016-10-27 21:26:48 +02001083Converts an archive of KorAP-XML documents. It expects a directory
1084(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001085
1086=item B<extract>
1087
Akrona76d8352016-10-27 16:27:32 +02001088 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1089
1090Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001091
Akron63f20d42017-04-10 23:40:29 +02001092=item B<serial>
1093
1094 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1095
1096Convert archives sequentially. The inputs are not merged but treated
1097as they are (so they may be premerged or globs).
1098The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001099are created based on the archive name. In case the C<--to-tar> flag is given,
1100the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001101
1102
Akron941c1a62016-02-23 17:41:41 +01001103=back
1104
1105
1106=head1 OPTIONS
1107
1108=over 2
1109
Akrona76d8352016-10-27 16:27:32 +02001110=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001111
Akrona76d8352016-10-27 16:27:32 +02001112Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001113
Akron7606afa2016-10-25 16:23:49 +02001114Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001115document, while C<archive> expects a KorAP-XML corpus folder or a zip
1116file to batch process multiple files.
1117C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001118
Akrona76d8352016-10-27 16:27:32 +02001119C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001120that the first archive listed contains all primary data files
1121and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001122
Akron7606afa2016-10-25 16:23:49 +02001123 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001124
Akron821db3d2017-04-06 21:19:31 +02001125Input may also be defined using BSD glob wildcards.
1126
1127 -i 'file/news*.zip'
1128
1129The extended input array will be sorted in length order, so the shortest
1130path needs to contain all primary data files and all meta data files.
1131
Akron0c3e3752016-06-28 15:55:53 +02001132(The directory structure follows the base directory format,
1133that may include a C<.> root folder.
1134In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001135need to be passed with a hash sign in front of the archive's name.
1136This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001137
Akron7606afa2016-10-25 16:23:49 +02001138To support zip files, a version of C<unzip> needs to be installed that is
1139compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001140
Akron7606afa2016-10-25 16:23:49 +02001141B<The root folder switch using the hash sign is experimental and
1142may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001143
Akronf73ffb62018-06-27 12:13:59 +02001144
Akron63f20d42017-04-10 23:40:29 +02001145=item B<--input-base|-ib> <directory>
1146
1147The base directory for inputs.
1148
1149
Akron941c1a62016-02-23 17:41:41 +01001150=item B<--output|-o> <directory|file>
1151
1152Output folder for archive processing or
1153document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001154writes to C<STDOUT> by default
1155(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001156
1157=item B<--overwrite|-w>
1158
1159Overwrite files that already exist.
1160
Akronf73ffb62018-06-27 12:13:59 +02001161
Akron3741f8b2016-12-21 19:55:21 +01001162=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001163
1164Define the default tokenization by specifying
1165the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001166of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001167
Akron3741f8b2016-12-21 19:55:21 +01001168
1169=item B<--base-sentences|-bs> <foundry>#<layer>
1170
1171Define the layer for base sentences.
1172If given, this will be used instead of using C<Base#Sentences>.
1173Currently C<DeReKo#Structure> is the only additional layer supported.
1174
1175 Defaults to unset.
1176
1177
1178=item B<--base-paragraphs|-bp> <foundry>#<layer>
1179
1180Define the layer for base paragraphs.
1181If given, this will be used instead of using C<Base#Paragraphs>.
1182Currently C<DeReKo#Structure> is the only additional layer supported.
1183
1184 Defaults to unset.
1185
1186
Akron41ac10b2017-02-08 22:47:25 +01001187=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1188
1189Define the layer for base pagebreaks.
1190Currently C<DeReKo#Structure> is the only layer supported.
1191
1192 Defaults to unset.
1193
1194
Akron941c1a62016-02-23 17:41:41 +01001195=item B<--skip|-s> <foundry>[#<layer>]
1196
Akronf7ad89e2016-03-16 18:22:47 +01001197Skip specific annotations by specifying the foundry
1198(and optionally the layer with a C<#>-prefix),
1199e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001200Can be set multiple times.
1201
Akronf73ffb62018-06-27 12:13:59 +02001202
Akronc13a1702016-03-15 19:33:14 +01001203=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001204
Akronf7ad89e2016-03-16 18:22:47 +01001205Convert specific annotations by specifying the foundry
1206(and optionally the layer with a C<#>-prefix),
1207e.g. C<Mate> or C<Mate#Morpho>.
1208Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001209
Akronf73ffb62018-06-27 12:13:59 +02001210
Akron941c1a62016-02-23 17:41:41 +01001211=item B<--primary|-p>
1212
Akronc13a1702016-03-15 19:33:14 +01001213Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001214Can be flagged using C<--no-primary> as well.
1215This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001216
Akronf73ffb62018-06-27 12:13:59 +02001217
Akron941c1a62016-02-23 17:41:41 +01001218=item B<--jobs|-j>
1219
1220 Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001221for archive processing.
Akron11c80302016-03-18 19:44:43 +01001222Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001223
1224If C<sequential-extraction> is not set to false, this will
1225also apply to extraction.
1226
Akronc11f7982017-02-21 21:20:14 +01001227Pass -1, and the value will be set automatically to 5
1228times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001229This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001230
Akronf73ffb62018-06-27 12:13:59 +02001231
Akron9ec88872017-04-12 16:29:06 +02001232=item B<--sequential-extraction|-se>
1233
1234Flag to indicate, if the C<jobs> value also applies to extraction.
1235Some systems may have problems with extracting multiple archives
1236to the same folder at the same time.
1237Can be flagged using C<--no-sequential-extraction> as well.
1238Defaults to C<false>.
1239
Akronf73ffb62018-06-27 12:13:59 +02001240
Akron35db6e32016-03-17 22:42:22 +01001241=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001242
Akron35db6e32016-03-17 22:42:22 +01001243Define the metadata parser to use. Defaults to C<I5>.
1244Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1245This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001246
Akronf73ffb62018-06-27 12:13:59 +02001247
Akron941c1a62016-02-23 17:41:41 +01001248=item B<--pretty|-y>
1249
Akronc13a1702016-03-15 19:33:14 +01001250Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001251This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001252
Akronf73ffb62018-06-27 12:13:59 +02001253
Akron941c1a62016-02-23 17:41:41 +01001254=item B<--gzip|-z>
1255
Akronf7ad89e2016-03-16 18:22:47 +01001256Compress the output.
1257Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001258
Akronf73ffb62018-06-27 12:13:59 +02001259
Akron11c80302016-03-18 19:44:43 +01001260=item B<--cache|-c>
1261
1262File to mmap a cache (using L<Cache::FastMmap>).
1263Defaults to C<korapxml2krill.cache> in the calling directory.
1264
Akronf73ffb62018-06-27 12:13:59 +02001265
Akron11c80302016-03-18 19:44:43 +01001266=item B<--cache-size|-cs>
1267
1268Size of the cache. Defaults to C<50m>.
1269
Akronf73ffb62018-06-27 12:13:59 +02001270
Akron11c80302016-03-18 19:44:43 +01001271=item B<--cache-init|-ci>
1272
1273Initialize cache file.
1274Can be flagged using C<--no-cache-init> as well.
1275Defaults to C<true>.
1276
Akronf73ffb62018-06-27 12:13:59 +02001277
Akron11c80302016-03-18 19:44:43 +01001278=item B<--cache-delete|-cd>
1279
1280Delete cache file after processing.
1281Can be flagged using C<--no-cache-delete> as well.
1282Defaults to C<true>.
1283
Akronf73ffb62018-06-27 12:13:59 +02001284
Akron636aa112017-04-07 18:48:56 +02001285=item B<--config|-cfg>
1286
1287Configure the parameters of your call in a file
1288of key-value pairs with whitespace separator
1289
1290 overwrite 1
1291 token DeReKo#Structure
1292 ...
1293
1294Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001295C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001296C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001297C<output>,
1298C<temp-extract>, C<sequential-extraction>,
1299C<base-sentences>, C<base-paragraphs>,
1300C<base-pagebreaks>,
1301C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001302(semicolon separated), C<anno> (semicolon separated).
1303
Akronf73ffb62018-06-27 12:13:59 +02001304Configuration parameters will always be overwritten by
1305passed parameters.
1306
1307
Akron81500102017-04-07 20:45:44 +02001308=item B<--temporary-extract|-te>
1309
1310Only valid for the C<archive> command.
1311
1312This will first extract all files into a
1313directory and then will archive.
1314If the directory is given as C<:temp:>,
1315a temporary directory is used.
1316This is especially useful to avoid
1317massive unzipping and potential
1318network latency.
Akron636aa112017-04-07 18:48:56 +02001319
Akronf73ffb62018-06-27 12:13:59 +02001320
Akrone10ad322016-02-27 10:54:26 +01001321=item B<--sigle|-sg>
1322
Akron20807582016-10-26 17:11:34 +02001323Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001324Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001325I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001326Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001327In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001328On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron941c1a62016-02-23 17:41:41 +01001331=item B<--log|-l>
1332
1333 The L<Log::Log4perl> log level, defaults to C<ERROR>.
1334
Akronf73ffb62018-06-27 12:13:59 +02001335
Akron941c1a62016-02-23 17:41:41 +01001336=item B<--help|-h>
1337
1338Print this document.
1339
Akronf73ffb62018-06-27 12:13:59 +02001340
Akron941c1a62016-02-23 17:41:41 +01001341=item B<--version|-v>
1342
1343Print version information.
1344
1345=back
1346
Akronf73ffb62018-06-27 12:13:59 +02001347
Akronc13a1702016-03-15 19:33:14 +01001348=head1 ANNOTATION SUPPORT
1349
1350L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1351developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1352The base foundry with paragraphs, sentences, and the text element are mandatory for
1353L<Krill|https://github.com/KorAP/Krill>.
1354
Akron821db3d2017-04-06 21:19:31 +02001355 Base
1356 #Paragraphs
1357 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001358
Akron821db3d2017-04-06 21:19:31 +02001359 Connexor
1360 #Morpho
1361 #Phrase
1362 #Sentences
1363 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001364
Akron821db3d2017-04-06 21:19:31 +02001365 CoreNLP
1366 #Constituency
1367 #Morpho
1368 #NamedEntities
1369 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001370
Akronce125b62017-06-19 11:54:36 +02001371 CMC
1372 #Morpho
1373
Akron821db3d2017-04-06 21:19:31 +02001374 DeReKo
1375 #Structure
Akronc13a1702016-03-15 19:33:14 +01001376
Akron821db3d2017-04-06 21:19:31 +02001377 DRuKoLa
1378 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001379
Akron821db3d2017-04-06 21:19:31 +02001380 Glemm
1381 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001382
Akron4c679192018-01-16 17:41:49 +01001383 LWC
1384 #Dependency
1385
Akron821db3d2017-04-06 21:19:31 +02001386 Malt
1387 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001388
Akron821db3d2017-04-06 21:19:31 +02001389 MarMoT
1390 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001391
Akron821db3d2017-04-06 21:19:31 +02001392 Mate
1393 #Dependency
1394 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001395
Akron821db3d2017-04-06 21:19:31 +02001396 MDParser
1397 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001398
Akron821db3d2017-04-06 21:19:31 +02001399 OpenNLP
1400 #Morpho
1401 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001402
Akron821db3d2017-04-06 21:19:31 +02001403 Sgbr
1404 #Lemma
1405 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001406
Akron821db3d2017-04-06 21:19:31 +02001407 TreeTagger
1408 #Morpho
1409 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001410
Akron821db3d2017-04-06 21:19:31 +02001411 XIP
1412 #Constituency
1413 #Morpho
1414 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001415
Akronc13a1702016-03-15 19:33:14 +01001416
1417More importers are in preparation.
1418New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1419See the built-in annotation importers as examples.
1420
Akronf73ffb62018-06-27 12:13:59 +02001421
Akron941c1a62016-02-23 17:41:41 +01001422=head1 AVAILABILITY
1423
1424 https://github.com/KorAP/KorAP-XML-Krill
1425
1426
1427=head1 COPYRIGHT AND LICENSE
1428
Akron4c679192018-01-16 17:41:49 +01001429Copyright (C) 2015-2018, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001430
Akron941c1a62016-02-23 17:41:41 +01001431Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001432
Akrona76d8352016-10-27 16:27:32 +02001433Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001434
1435L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1436Corpus Analysis Platform at the
1437L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1438member of the
1439L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1440
1441This program is free software published under the
1442L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1443
1444=cut