blob: 98a41bec00c4b172e384065e47c24216db77754a [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron941c1a62016-02-23 17:41:41 +0100122# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100123
Akron3abc03e2017-06-29 16:23:35 +0200124our $LAST_CHANGE = '2017/06/29';
Akron941c1a62016-02-23 17:41:41 +0100125our $LOCAL = $FindBin::Bin;
126our $VERSION_MSG = <<"VERSION";
127Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
128VERSION
129
Akron63f20d42017-04-10 23:40:29 +0200130# Prototypes
131sub get_file_name_from_glob($);
132sub get_file_name($);
133
Akron941c1a62016-02-23 17:41:41 +0100134# Parse command
135my $cmd;
136our @ARGV;
137if ($ARGV[0] && index($ARGV[0], '-') != 0) {
138 $cmd = shift @ARGV;
Akron150b29e2016-02-14 23:06:48 +0100139};
Akron63f20d42017-04-10 23:40:29 +0200140my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100141
Akron5f51d422016-08-16 16:26:43 +0200142my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100143my $text;
Akrone10ad322016-02-27 10:54:26 +0100144
Akron941c1a62016-02-23 17:41:41 +0100145# Parse options from the command line
Nils Diewald7364d1f2013-11-05 19:26:35 +0000146GetOptions(
Akron08385f62016-03-22 20:37:04 +0100147 'input|i=s' => \@input,
Akron63f20d42017-04-10 23:40:29 +0200148 'input-base|ib=s' => \(my $input_base),
Akron941c1a62016-02-23 17:41:41 +0100149 'output|o=s' => \(my $output),
150 'overwrite|w' => \(my $overwrite),
Akron35db6e32016-03-17 22:42:22 +0100151 'meta|m=s' => \(my $meta),
Akron636aa112017-04-07 18:48:56 +0200152 'token|t=s' => \(my $token_base),
153 'base-sentences|bs=s' => \(my $base_sentences),
154 'base-paragraphs|bp=s' => \(my $base_paragraphs),
155 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
Akron941c1a62016-02-23 17:41:41 +0100156 'gzip|z' => \(my $gzip),
Akron81500102017-04-07 20:45:44 +0200157 'temporary-extract|te=s' => \(my $extract_dir),
Akrone10ad322016-02-27 10:54:26 +0100158 'skip|s=s' => \@skip,
159 'sigle|sg=s' => \@sigle,
Akron636aa112017-04-07 18:48:56 +0200160 'cache|c=s' => \(my $cache_file),
161 'config|cfg=s' => \(my $cfg_file),
162 'log|l=s' => \(my $log_level),
Akron5f51d422016-08-16 16:26:43 +0200163 'anno|a=s' => \@anno,
Akron941c1a62016-02-23 17:41:41 +0100164 'primary|p!' => \(my $primary),
165 'pretty|y' => \(my $pretty),
Akron636aa112017-04-07 18:48:56 +0200166 'jobs|j=i' => \(my $jobs),
Akron486f9ab2017-04-22 23:25:19 +0200167 'to-tar' => \(my $to_tar),
Akron9ec88872017-04-12 16:29:06 +0200168 'sequential-extraction|se' => \(my $sequential_extraction),
Akron636aa112017-04-07 18:48:56 +0200169 'cache-size|cs=s' => \(my $cache_size),
170 'cache-delete|cd!' => \(my $cache_delete),
171 'cache-init|ci!' => \(my $cache_init),
Akron941c1a62016-02-23 17:41:41 +0100172 'help|h' => sub {
173 pod2usage(
174 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200175 -verbose => 99,
176 -msg => $VERSION_MSG,
177 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100178 );
179 },
180 'version|v' => sub {
181 pod2usage(
Akron7d4cdd82016-08-17 21:39:45 +0200182 -verbose => 0,
183 -msg => $VERSION_MSG,
184 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100185 )
186 }
Nils Diewald7364d1f2013-11-05 19:26:35 +0000187);
188
Akron63f20d42017-04-10 23:40:29 +0200189
Akron636aa112017-04-07 18:48:56 +0200190# Load from configuration
if ($cfg_file && -e $cfg_file) {

  # Values from the configuration file only act as fallbacks:
  # everything passed on the command line takes precedence.
  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single valued options: configuration key => variable to fill
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # Already set on the command line
    next if defined $$target;

    # Not set in the configuration file
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # Multi valued options: given as a semicolon separated list
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # Already set on the command line
    next if scalar(@$target);

    # Not set in the configuration file
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
301
Akron63f20d42017-04-10 23:40:29 +0200302
Akron636aa112017-04-07 18:48:56 +0200303# Set default token base
# Fall back to defaults for everything that was neither set
# on the command line nor in the configuration file
$token_base            //= 'OpenNLP#tokens';
$cache_file            //= 'korapxml2krill.cache';
$cache_size            //= '50m';
$jobs                  //= 0;
$cache_delete          //= 1;
$cache_init            //= 1;
$sequential_extraction //= 0;
$log_level             //= 'ERROR';

# The base annotations default to the empty string and are normalized
# to lower case, as they are compared with lower case names below
foreach my $base ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $base = lc($base // '');
};
Akron3741f8b2016-12-21 19:55:21 +0100319
Akron63f20d42017-04-10 23:40:29 +0200320
321# Initialize log4perl object
322Log::Log4perl->init({
323 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
324 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
325 'log4perl.appender.STDERR.layout' => 'PatternLayout',
326 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
327});
328
329my $log = Log::Log4perl->get_logger('main');
330
331
332print "Reading config from $cfg_file\n" if $cfg_file;
333
334
Akron941c1a62016-02-23 17:41:41 +0100335my %ERROR_HASH = (
336 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200337 -verbose => 99,
338 -msg => $VERSION_MSG,
339 -output => '-',
340 -exit => 1
Akron941c1a62016-02-23 17:41:41 +0100341);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000342
Akron941c1a62016-02-23 17:41:41 +0100343# Input has to be defined
Akron08385f62016-03-22 20:37:04 +0100344pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000345
Akrone1dbc382016-07-08 22:24:52 +0200346# Gzip has no effect, if no output is given
347pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000348
Akronc11f7982017-02-21 21:20:14 +0100349
Akron636aa112017-04-07 18:48:56 +0200350if ($jobs eq '-1') {
Akronc11f7982017-02-21 21:20:14 +0100351 state $cores = Sys::Info->new->device('CPU')->count;
352 $jobs = ceil(5 * $cores);
Akron636aa112017-04-07 18:48:56 +0200353 $log->info("Run using $jobs jobs on $cores cores");
Akronc11f7982017-02-21 21:20:14 +0100354};
355
Akron821db3d2017-04-06 21:19:31 +0200356
Akron63f20d42017-04-10 23:40:29 +0200357# Start serial processing
# The command may be undefined (single file processing), so check for
# definedness before comparing to avoid an "uninitialized" warning.
if (defined $cmd && $cmd eq 'serial') {

  # Without --to-tar the output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output parameters (flag and following value)
  # from the kept arguments, as they are set per archive run below
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - the next argument is its value
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the preceding input or output flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Number of archive runs that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $glob (@input) {

    # The output of each input goes to a separate location
    # named after the input (this will create a directory)
    my $new_out = catdir($output, get_file_name_from_glob($glob));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $glob, '-o', $new_out);
    print "Start serial processing of $glob to $new_out\n";

    # Start archiving - system() returns non-zero if the run could not
    # be started or did not exit successfully. Report the failure, but
    # continue with the remaining inputs.
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $glob failed (status $?)");
      $failed++;
    };
  };

  # Signal failed runs to the caller
  exit($failed ? 1 : 0);
};
411
# Lookup table of all annotations to skip (in lower case)
my %skip = map { (lc($_), 1) } @skip;

# Collect the structures DeReKo is requested to be the base of
my @dereko_attr = ();
push(@dereko_attr, 'sentences')  if $base_sentences  eq 'dereko#structure';
push(@dereko_attr, 'paragraphs') if $base_paragraphs eq 'dereko#structure';
push(@dereko_attr, 'pagebreaks') if $base_pagebreaks eq 'dereko#structure';

# All known annotation layers as [Foundry, Layer(, Option)] -
# the order is relevant and has to be kept
my @layers = (

  # Base - only used, if no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - with the base structures as an option, if any are requested
  (
    @dereko_attr ?
      ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
      ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly
# requested annotations (given as Foundry#Layer)
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
510
# Get tokenization basis by splitting Foundry#Layer (e.g. 'OpenNLP#tokens').
# The declaration is separated from the conditional assignment, as the
# behaviour of "my ... if ..." is undefined (see perlsyn).
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};
513
514# TODO: This should not be initialized for batch
515my $cache = Cache::FastMmap->new(
516 share_file => $cache_file,
517 cache_size => $cache_size,
Akron03b24db2016-08-16 20:54:32 +0200518 init_file => $cache_init
Akrone1dbc382016-07-08 22:24:52 +0200519);
520
Akron03b24db2016-08-16 20:54:32 +0200521# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200522my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200523 cache => $cache,
Akrone1dbc382016-07-08 22:24:52 +0200524 meta_type => $meta,
525 overwrite => $overwrite,
Akron03b24db2016-08-16 20:54:32 +0200526 foundry => $token_base_foundry,
527 layer => $token_base_layer,
528 gzip => $gzip,
529 log => $log,
530 primary => $primary,
531 pretty => $pretty,
532 anno => \@filtered_anno
Akrone1dbc382016-07-08 22:24:52 +0200533);
534
# Get file name based on path information.
#
# Expects the path of a text directory and returns a flat name (without
# file extension) for the output file: A temporary directory prefix and
# the base path of the first input are removed, slashes are turned into
# dashes and leading dashes are stripped. If at least four dash separated
# parts remain, the first part is dropped.
sub get_file_name ($) {
  my $file = shift;

  # Base path of the first input - in case of a directory
  # the last path segment (without trailing slash) is removed
  my $i = $input[0] // '';
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path - the path is quoted, as it is a plain string
  # that may contain regular expression meta characters (e.g. '.' or '+').
  # NOTE(review): "^?" makes the anchor optional, so the base path is
  # presumably matched anywhere in the string - confirm if this is intended.
  $file =~ s/^?\/?\Q$i\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the first part, if at least three further parts follow
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
551
Akron63f20d42017-04-10 23:40:29 +0200552
# Turn an input path or glob pattern into a flat name, that can be
# used as the name of an output directory or file.
sub get_file_name_from_glob ($) {
  my $name = shift;

  # Remove arbitrary fills
  $name =~ tr!*?!!d;

  # Transform path separators as well as class and multiple brackets
  # to the binding character
  $name =~ tr!\\/{}[]!-!;

  # Squeeze sequences of binding characters
  $name =~ tr!-!!s;

  # Clean beginning and end
  $name =~ s/^-//;
  $name =~ s/-$//;

  # Remove file extension
  $name =~ s/\.zip$//;

  return $name;
};
564
565
Akrone10ad322016-02-27 10:54:26 +0100566# Convert sigle to path construct
567s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
568
Akron7d4cdd82016-08-17 21:39:45 +0200569if ($cmd) {
Akron486f9ab2017-04-22 23:25:19 +0200570 if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
Akron3abc03e2017-06-29 16:23:35 +0200571 $log->error("Directory '$output' does not exist.");
572 exit 1;
Akron7d4cdd82016-08-17 21:39:45 +0200573 };
574};
575
Akron63f20d42017-04-10 23:40:29 +0200576
# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    $wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    # Expand wildcards - a pattern may resolve to multiple files or to none
    push (@new_input, bsd_glob($wild_card));
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  # All patterns failed to match - stop here, as everything
  # following relies on a defined first input
  unless (@input) {
    $log->error('No input found matching the given input patterns.');
    exit 1;
  };

  print 'Input is ' . join(', ', @input)."\n";
};
596
597
Akron941c1a62016-02-23 17:41:41 +0100598# Process a single file
599unless ($cmd) {
Akron08385f62016-03-22 20:37:04 +0100600 my $input = $input[0];
Nils Diewald59094f22014-11-05 18:20:50 +0000601
Akron941c1a62016-02-23 17:41:41 +0100602 BEGIN {
603 $main::TIME = Benchmark->new;
604 $main::LAST_STOP = Benchmark->new;
605 };
606
607 sub stop_time {
608 my $new = Benchmark->new;
Akron5f51d422016-08-16 16:26:43 +0200609 $log->info(
Akron941c1a62016-02-23 17:41:41 +0100610 'The code took: '.
Akron5f51d422016-08-16 16:26:43 +0200611 timestr(timediff($new, $main::LAST_STOP)) .
612 ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
613 );
Akron941c1a62016-02-23 17:41:41 +0100614 $main::LAST_STOP = $new;
615 };
616
617 # Create and parse new document
618 $input =~ s{([^/])$}{$1/};
Akron941c1a62016-02-23 17:41:41 +0100619
Akron7d4cdd82016-08-17 21:39:45 +0200620 # Process file
Akrone1dbc382016-07-08 22:24:52 +0200621 $batch_file->process($input, $output);
Akron941c1a62016-02-23 17:41:41 +0100622
Akron11c80302016-03-18 19:44:43 +0100623 # Delete cache file
624 unlink($cache_file) if $cache_delete;
625
Akron5f51d422016-08-16 16:26:43 +0200626 stop_time;
Akron3abc03e2017-06-29 16:23:35 +0200627 exit;
Akron81500102017-04-07 20:45:44 +0200628};
629
Nils Diewald59094f22014-11-05 18:20:50 +0000630
Akrone10ad322016-02-27 10:54:26 +0100631# Extract XML files
Akron81500102017-04-07 20:45:44 +0200632if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100633
Akron7d4cdd82016-08-17 21:39:45 +0200634 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200635 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100636
Akron7d4cdd82016-08-17 21:39:45 +0200637 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100638 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200639 $log->error("Unzip is not installed or incompatible.");
640 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100641 };
642
Akronb0c88db2016-06-29 16:33:18 +0200643 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200644 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200645
Akron651cb8d2016-08-16 21:44:49 +0200646 my $prefix = 1;
647
Akron03b24db2016-08-16 20:54:32 +0200648 # No sigles given
649 unless (@sigle) {
650
651 # Get files
652 foreach ($archive->list_texts) {
653
654 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200655 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200656
657 # TODO: Make this OS independent
658 push @sigle, join '/', $corpus, $doc, $text;
659 };
Akron20807582016-10-26 17:11:34 +0200660 }
661
662 # Check sigle for doc sigles
663 else {
664 my @new_sigle;
665
666 my $prefix_check = 0;
667
668 # Iterate over all sigle
669 foreach (@sigle) {
670
671 # Sigle is a doc sigle
672 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200673
Akron60a8caa2017-02-17 21:51:27 +0100674 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200675 # Check if a prefix is needed
676 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100677
678 if ($prefix = $archive->check_prefix) {
679 print " with prefix ...";
680 };
Akron20807582016-10-26 17:11:34 +0200681 $prefix_check = 1;
682 };
683
Akron60a8caa2017-02-17 21:51:27 +0100684 print "\n";
685
Akron20807582016-10-26 17:11:34 +0200686 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200687 my $path = ($prefix ? './' : '') . $_;
688
689 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200690 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200691 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200692 ) ? '' : 'not '
693 );
694 print "extracted.\n";
695 }
Akron60a8caa2017-02-17 21:51:27 +0100696
697 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200698 else {
699 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100700
701 unless ($prefix_check) {
702
703 if ($prefix = $archive->check_prefix) {
704 print " with prefix ...";
705 };
706 $prefix_check = 1;
707 };
Akron20807582016-10-26 17:11:34 +0200708 };
709 };
710 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200711 };
712
Akrone10ad322016-02-27 10:54:26 +0100713 # Iterate over all given sigles and extract
714 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100715
Akron2812ba22016-10-28 21:55:59 +0200716 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200717
Akron03b24db2016-08-16 20:54:32 +0200718 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200719 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100720
Akron20807582016-10-26 17:11:34 +0200721 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200722 ($prefix ? './' : '') . $_, $output
723 ) ? '' : 'not '
724 );
Akrone10ad322016-02-27 10:54:26 +0100725 print "extracted.\n";
726 };
Akronb0c88db2016-06-29 16:33:18 +0200727 }
Akron7d4cdd82016-08-17 21:39:45 +0200728
729 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200730 else {
731 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200732 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100733 };
734}
735
Akron81500102017-04-07 20:45:44 +0200736
Akron941c1a62016-02-23 17:41:41 +0100737# Process an archive
738elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000739
Akron81500102017-04-07 20:45:44 +0200740 my $archive_output;
741
742 # First extract, then archive
743 if (defined $extract_dir) {
744
745 # Create new archive object
746 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
747
748 # Check zip capabilities
749 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200750 $log->error("Unzip is not installed or incompatible.");
751 exit 1;
Akron81500102017-04-07 20:45:44 +0200752 };
753
754 # Add further annotation archived
755 $archive->attach($_) foreach @input[1..$#input];
756
757 # Create a temporary directory
758 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200759 $extract_dir = tempdir(CLEANUP => 0);
760 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200761 };
762
Akron63f20d42017-04-10 23:40:29 +0200763 # Add some random extra to avoid clashes with multiple archives
764 $extract_dir = catdir($extract_dir, random_string('cccccc'));
765
766 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200767 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200768 @input = ($extract_dir);
769 }
770 else {
771 $log->error('Unable to extract from primary archive ' . $input[0] .
772 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200773 exit 1;
Akron81500102017-04-07 20:45:44 +0200774 };
775 }
776
777 # Can't create archive object
778 else {
779 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200780 exit 1;
Akron81500102017-04-07 20:45:44 +0200781 };
782 };
783
Akrone1dbc382016-07-08 22:24:52 +0200784 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100785
Akron7d4cdd82016-08-17 21:39:45 +0200786 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100787 my $pool = Parallel::ForkManager->new($jobs);
788
Akron7d4cdd82016-08-17 21:39:45 +0200789 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100790 my $iter = 1; # Current text in process
791
Akronda3097e2017-04-23 19:53:57 +0200792 my $tar_archive;
793 my $output_dir = $output;
794 my $tar_fh;
795
796 # Initialize tar archive
797 if ($to_tar) {
798 $tar_archive = Archive::Tar::Builder->new(
799 ignore_errors => 1
800 );
801
802 # Set output name
803 my $tar_file = $output;
804 unless ($tar_file =~ /\.tar$/) {
805 $tar_file .= '.tar';
806 };
807
808 # Initiate the tar file
809 print "Writing to file $tar_file\n";
810 $tar_fh = IO::File->new($tar_file, 'w');
811 $tar_fh->binmode(1);
812
813 # Set handle
814 $tar_archive->set_handle($tar_fh);
815
816 # Output to temporary directory
817 $output_dir = File::Temp->newdir;
818 };
819
Akron941c1a62016-02-23 17:41:41 +0100820 # Report on fork message
821 $pool->run_on_finish (
822 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200823 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100824 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200825
Akron08385f62016-03-22 20:37:04 +0100826 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200827 ($iter++) . "/$count]" .
828 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200829 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200830
831 if (!$code && $to_tar && $data->[2]) {
832 my $filename = $data->[2];
833
834 # Lock filehandle
835 if (flock($tar_fh, LOCK_EX)) {
836
837 # Archive and remove file
838 $tar_archive->archive($filename);
839 unlink $filename;
840
841 # Unlock filehandle
842 flock($tar_fh, LOCK_UN);
843 }
844 else {
845 $log->warn("Unable to add $filename to archive");
846 };
847 };
848
Akron4c0cf312016-10-15 16:42:09 +0200849 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100850 }
851 );
852
# $t holds the benchmark taken when processing starts,
# $temp the temporary directory a text is extracted to
my ($t, $temp);
print "Reading data ...\n";

# Cache initialization is currently disabled:
# unless (Cache::FastMmap->new(
#   share_file => $cache_file,
#   cache_size => $cache_size,
#   init_file => $cache_init
# )) {
#   print "Unable to intialize cache '$cache_file'\n\n";
#   exit(1);
# };


# Input is a directory
if (-d $input[0]) {
  my $iterator = Directory::Iterator->new($input[0]);
  my @text_dirs;
  my $path;

  # Collect all directories containing a data.xml file
  do {
    unless ($iterator->is_directory) {
      $path = $iterator->get;

      # Strip the file name, so only the text directory remains
      if ($path && $path =~ s{/data\.xml$}{}) {
        push @text_dirs, $path;
        $iterator->prune;
      };
    };
  } while ($iterator->next);

  print "Start processing ...\n";
  $t = Benchmark->new;
  $count = scalar @text_dirs;

 DIRECTORY_LOOP:
  foreach my $text_dir (@text_dirs) {

    # Name of the file to write
    my $suffix = '.json' . ($gzip ? '.gz' : '');
    my $filename = catfile($output_dir, get_file_name($text_dir) . $suffix);

    # Get the next fork
    next DIRECTORY_LOOP if $pool->start;

    my $return = $batch_file->process($text_dir => $filename);

    # Conversion failed
    unless ($return) {
      $pool->finish(1, ["Unable to process " . $text_dir]);
    }

    # Conversion succeeded (a return value of -1 is reported
    # as an already existing file)
    else {
      my $message = "Processed " . $filename;
      $message .= " - already existing" if $return == -1;
      $pool->finish(0, [$message, undef, $filename]);
    };
  };
}

# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $annotation_archive (@input[1..$#input]) {
    $archive->attach($annotation_archive);
  };

  print "Start processing ...\n";
  $t = Benchmark->new;
  my @texts = $archive->list_texts;
  $count = scalar @texts;

 ARCHIVE_LOOP:
  foreach my $text_path (@texts) {

    # Split path information (the prefix is not needed)
    my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

    # Name of the file to write
    my $sigle_path = catfile($corpus, $doc, $text);
    my $filename = catfile(
      $output_dir,
      get_file_name($sigle_path . '.json' . ($gzip ? '.gz' : ''))
    );

    # Get the next fork
    next ARCHIVE_LOOP if $pool->start;

    # Create temporary directory to extract to
    $temp = File::Temp->newdir;

    # TODO: Check if $filename exists at the beginning,
    #       because extraction can be horribly slow!

    # Unable to extract
    unless ($archive->extract_text($text_path, $temp)) {
      $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
    }

    # Extracted from archive
    else {

      # Corpus directory below the temporary directory
      my $corpus_dir = catdir("$temp", $corpus);

      # Directory of the extracted text
      my $text_dir = catdir($corpus_dir, $doc, $text);

      # Write file and pass the temporary directory along
      my $return = $batch_file->process($text_dir => $filename);
      if ($return) {
        my $message = "Processed " . $filename;
        $message .= " - already existing" if $return == -1;
        $pool->finish(0, [$message, $temp, $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir, $temp]);
      };
    };
  };
}

else {
  print "Input is neither a directory nor an archive.\n\n";
};
# Wait for all jobs to finish
$pool->wait_all_children;

# Delete cache file
unlink($cache_file) if $cache_delete;

# Close tar filehandle
if ($to_tar && $tar_fh) {
  $tar_archive->finish;

  # Errors of buffered writes may only surface on close
  if ($tar_fh->close) {
    print "Wrote to tar archive.\n";
  }
  else {
    $log->error("Unable to close tar archive: $!");
  };
};

# Report the elapsed time. The benchmark is only started when the
# input was a directory or an archive, so skip the report otherwise.
print timestr(timediff(Benchmark->new, $t))."\n" if $t;
print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001006};
Akron941c1a62016-02-23 17:41:41 +01001007
Nils Diewald2db9ad02013-10-29 19:26:43 +00001008
# Remove the temporary extraction directory, if one was used,
# and report the number of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir .
    ' with ' . $removed . " objects.\n";
};


print "\n";
1017
Nils Diewald2db9ad02013-10-29 19:26:43 +00001018__END__
Akron941c1a62016-02-23 17:41:41 +01001019
1020=pod
1021
1022=encoding utf8
1023
1024=head1 NAME
1025
Akronf7ad89e2016-03-16 18:22:47 +01001026korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001027
1028
1029=head1 SYNOPSIS
1030
Akrona76d8352016-10-27 16:27:32 +02001031 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001032
Akron2fd402b2016-10-27 21:26:48 +02001033
Akron941c1a62016-02-23 17:41:41 +01001034=head1 DESCRIPTION
1035
1036L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1037compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001038The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001039
1040
1041=head1 INSTALLATION
1042
1043The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1044
Akronaf386982016-10-12 00:33:25 +02001045 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001046
Akronc13a1702016-03-15 19:33:14 +01001047In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001048be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001049Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +02001050In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001051
1052=head1 ARGUMENTS
1053
Akrona76d8352016-10-27 16:27:32 +02001054 $ korapxml2krill -z --input <directory> --output <filename>
1055
1056Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001057It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001058
Akron941c1a62016-02-23 17:41:41 +01001059=over 2
1060
1061=item B<archive>
1062
Akron081639e2017-04-21 19:01:39 +02001063 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001064
Akron2fd402b2016-10-27 21:26:48 +02001065Converts an archive of KorAP-XML documents. It expects a directory
1066(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001067
1068=item B<extract>
1069
Akrona76d8352016-10-27 16:27:32 +02001070 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1071
1072Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001073
Akron63f20d42017-04-10 23:40:29 +02001074=item B<serial>
1075
1076 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1077
1078Convert archives sequentially. The inputs are not merged but treated
1079as they are (so they may be premerged or globs).
1080 The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001081are created based on the archive name. In case the C<--to-tar> flag is given,
1082the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001083
1084
Akron941c1a62016-02-23 17:41:41 +01001085=back
1086
1087
1088=head1 OPTIONS
1089
1090=over 2
1091
Akrona76d8352016-10-27 16:27:32 +02001092=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001093
Akrona76d8352016-10-27 16:27:32 +02001094Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001095
Akron7606afa2016-10-25 16:23:49 +02001096Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001097document, while C<archive> expects a KorAP-XML corpus folder or a zip
1098file to batch process multiple files.
1099C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001100
Akrona76d8352016-10-27 16:27:32 +02001101C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001102that the first archive listed contains all primary data files
1103and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001104
Akron7606afa2016-10-25 16:23:49 +02001105 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001106
Akron821db3d2017-04-06 21:19:31 +02001107Input may also be defined using BSD glob wildcards.
1108
1109 -i 'file/news*.zip'
1110
1111The extended input array will be sorted in length order, so the shortest
1112path needs to contain all primary data files and all meta data files.
1113
Akron0c3e3752016-06-28 15:55:53 +02001114(The directory structure follows the base directory format,
1115that may include a C<.> root folder.
1116In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001117need to be passed with a hash sign in front of the archive's name.
1118 This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001119
Akron7606afa2016-10-25 16:23:49 +02001120To support zip files, a version of C<unzip> needs to be installed that is
1121compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001122
Akron7606afa2016-10-25 16:23:49 +02001123B<The root folder switch using the hash sign is experimental and
1124may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001125
Akron63f20d42017-04-10 23:40:29 +02001126=item B<--input-base|-ib> <directory>
1127
1128The base directory for inputs.
1129
1130
Akron941c1a62016-02-23 17:41:41 +01001131=item B<--output|-o> <directory|file>
1132
1133Output folder for archive processing or
1134document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001135writes to C<STDOUT> by default
1136(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001137
1138=item B<--overwrite|-w>
1139
1140Overwrite files that already exist.
1141
Akron3741f8b2016-12-21 19:55:21 +01001142=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001143
1144Define the default tokenization by specifying
1145the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001146of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001147
Akron3741f8b2016-12-21 19:55:21 +01001148
1149=item B<--base-sentences|-bs> <foundry>#<layer>
1150
1151Define the layer for base sentences.
1152If given, this will be used instead of using C<Base#Sentences>.
1153Currently C<DeReKo#Structure> is the only additional layer supported.
1154
1155 Defaults to unset.
1156
1157
1158=item B<--base-paragraphs|-bp> <foundry>#<layer>
1159
1160Define the layer for base paragraphs.
1161If given, this will be used instead of using C<Base#Paragraphs>.
1162Currently C<DeReKo#Structure> is the only additional layer supported.
1163
1164 Defaults to unset.
1165
1166
Akron41ac10b2017-02-08 22:47:25 +01001167=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1168
1169Define the layer for base pagebreaks.
1170Currently C<DeReKo#Structure> is the only layer supported.
1171
1172 Defaults to unset.
1173
1174
Akron941c1a62016-02-23 17:41:41 +01001175=item B<--skip|-s> <foundry>[#<layer>]
1176
Akronf7ad89e2016-03-16 18:22:47 +01001177Skip specific annotations by specifying the foundry
1178(and optionally the layer with a C<#>-prefix),
1179e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001180Can be set multiple times.
1181
Akronc13a1702016-03-15 19:33:14 +01001182=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001183
Akronf7ad89e2016-03-16 18:22:47 +01001184Convert specific annotations by specifying the foundry
1185(and optionally the layer with a C<#>-prefix),
1186e.g. C<Mate> or C<Mate#Morpho>.
1187Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001188
1189=item B<--primary|-p>
1190
Akronc13a1702016-03-15 19:33:14 +01001191Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001192Can be flagged using C<--no-primary> as well.
1193This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001194
1195=item B<--jobs|-j>
1196
1197 Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001198for archive processing.
Akron11c80302016-03-18 19:44:43 +01001199Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001200
1201 If C<sequential-extraction> is not set to true, this will
1202also apply to extraction.
1203
Akronc11f7982017-02-21 21:20:14 +01001204Pass -1, and the value will be set automatically to 5
1205times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001206This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001207
Akron9ec88872017-04-12 16:29:06 +02001208=item B<--sequential-extraction|-se>
1209
1210Flag to indicate, if the C<jobs> value also applies to extraction.
1211Some systems may have problems with extracting multiple archives
1212to the same folder at the same time.
1213Can be flagged using C<--no-sequential-extraction> as well.
1214Defaults to C<false>.
1215
Akron35db6e32016-03-17 22:42:22 +01001216=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001217
Akron35db6e32016-03-17 22:42:22 +01001218Define the metadata parser to use. Defaults to C<I5>.
1219Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1220This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001221
1222=item B<--pretty|-y>
1223
Akronc13a1702016-03-15 19:33:14 +01001224Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001225This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001226
1227=item B<--gzip|-z>
1228
Akronf7ad89e2016-03-16 18:22:47 +01001229Compress the output.
1230Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001231
Akron11c80302016-03-18 19:44:43 +01001232=item B<--cache|-c>
1233
1234File to mmap a cache (using L<Cache::FastMmap>).
1235Defaults to C<korapxml2krill.cache> in the calling directory.
1236
1237=item B<--cache-size|-cs>
1238
1239Size of the cache. Defaults to C<50m>.
1240
1241=item B<--cache-init|-ci>
1242
1243Initialize cache file.
1244Can be flagged using C<--no-cache-init> as well.
1245Defaults to C<true>.
1246
1247=item B<--cache-delete|-cd>
1248
1249Delete cache file after processing.
1250Can be flagged using C<--no-cache-delete> as well.
1251Defaults to C<true>.
1252
Akron636aa112017-04-07 18:48:56 +02001253=item B<--config|-cfg>
1254
1255Configure the parameters of your call in a file
1256of key-value pairs with whitespace separator
1257
1258 overwrite 1
1259 token DeReKo#Structure
1260 ...
1261
1262Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001263C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001264C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001265C<output>,
1266C<temp-extract>, C<sequential-extraction>,
1267C<base-sentences>, C<base-paragraphs>,
1268C<base-pagebreaks>,
1269C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001270(semicolon separated), C<anno> (semicolon separated).
1271
Akron81500102017-04-07 20:45:44 +02001272=item B<--temporary-extract|-te>
1273
1274Only valid for the C<archive> command.
1275
1276This will first extract all files into a
1277directory and then will archive.
1278If the directory is given as C<:temp:>,
1279a temporary directory is used.
1280This is especially useful to avoid
1281massive unzipping and potential
1282network latency.
Akron636aa112017-04-07 18:48:56 +02001283
Akrone10ad322016-02-27 10:54:26 +01001284=item B<--sigle|-sg>
1285
Akron20807582016-10-26 17:11:34 +02001286Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001287Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001288I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001289Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001290In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001291On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001292
Akron941c1a62016-02-23 17:41:41 +01001293=item B<--log|-l>
1294
1295 The L<Log::Log4perl> log level, defaults to C<ERROR>.
1296
1297=item B<--help|-h>
1298
1299Print this document.
1300
1301=item B<--version|-v>
1302
1303Print version information.
1304
1305=back
1306
Akronc13a1702016-03-15 19:33:14 +01001307=head1 ANNOTATION SUPPORT
1308
1309 L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1310developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1311The base foundry with paragraphs, sentences, and the text element are mandatory for
1312L<Krill|https://github.com/KorAP/Krill>.
1313
Akron821db3d2017-04-06 21:19:31 +02001314 Base
1315 #Paragraphs
1316 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001317
Akron821db3d2017-04-06 21:19:31 +02001318 Connexor
1319 #Morpho
1320 #Phrase
1321 #Sentences
1322 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001323
Akron821db3d2017-04-06 21:19:31 +02001324 CoreNLP
1325 #Constituency
1326 #Morpho
1327 #NamedEntities
1328 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001329
Akronce125b62017-06-19 11:54:36 +02001330 CMC
1331 #Morpho
1332
Akron821db3d2017-04-06 21:19:31 +02001333 DeReKo
1334 #Structure
Akronc13a1702016-03-15 19:33:14 +01001335
Akron821db3d2017-04-06 21:19:31 +02001336 DRuKoLa
1337 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001338
Akron821db3d2017-04-06 21:19:31 +02001339 Glemm
1340 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001341
Akron821db3d2017-04-06 21:19:31 +02001342 Malt
1343 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001344
Akron821db3d2017-04-06 21:19:31 +02001345 MarMoT
1346 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001347
Akron821db3d2017-04-06 21:19:31 +02001348 Mate
1349 #Dependency
1350 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001351
Akron821db3d2017-04-06 21:19:31 +02001352 MDParser
1353 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001354
Akron821db3d2017-04-06 21:19:31 +02001355 OpenNLP
1356 #Morpho
1357 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001358
Akron821db3d2017-04-06 21:19:31 +02001359 Sgbr
1360 #Lemma
1361 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001362
Akron821db3d2017-04-06 21:19:31 +02001363 TreeTagger
1364 #Morpho
1365 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001366
Akron821db3d2017-04-06 21:19:31 +02001367 XIP
1368 #Constituency
1369 #Morpho
1370 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001371
Akronc13a1702016-03-15 19:33:14 +01001372
1373More importers are in preparation.
1374New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1375See the built-in annotation importers as examples.
1376
Akron941c1a62016-02-23 17:41:41 +01001377=head1 AVAILABILITY
1378
1379 https://github.com/KorAP/KorAP-XML-Krill
1380
1381
1382=head1 COPYRIGHT AND LICENSE
1383
Akron3ec0a1c2017-01-18 14:41:55 +01001384Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001385
Akron941c1a62016-02-23 17:41:41 +01001386Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001387
Akrona76d8352016-10-27 16:27:32 +02001388Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001389
1390L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1391Corpus Analysis Platform at the
1392L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1393member of the
1394L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1395
1396This program is free software published under the
1397L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1398
1399=cut