blob: 8de1a1adec679fd7908de8a1fdb2c6acb9f66a7b [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akron941c1a62016-02-23 17:41:41 +0100132# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100133
Akron5fdc7e12018-07-19 12:37:48 +0200134our $LAST_CHANGE = '2018/07/19';
Akron941c1a62016-02-23 17:41:41 +0100135our $LOCAL = $FindBin::Bin;
136our $VERSION_MSG = <<"VERSION";
137Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
138VERSION
139
Akron63f20d42017-04-10 23:40:29 +0200140# Prototypes
141sub get_file_name_from_glob($);
142sub get_file_name($);
143
# Parse command
# A first argument that does not start with a dash is taken
# as the command; everything else is treated as options.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Keep a copy of the remaining arguments, as GetOptions
# consumes @ARGV while parsing
my @keep_argv = @ARGV;

# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options accepting a single value or flag
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(
  'input|i=s'                 => \@input,
  'input-base|ib=s'           => \$input_base,
  'output|o=s'                => \$output,
  'overwrite|w'               => \$overwrite,
  'meta|m=s'                  => \$meta,
  'token|t=s'                 => \$token_base,
  'base-sentences|bs=s'       => \$base_sentences,
  'base-paragraphs|bp=s'      => \$base_paragraphs,
  'base-pagebreaks|bpb=s'     => \$base_pagebreaks,
  'gzip|z'                    => \$gzip,
  'temporary-extract|te=s'    => \$extract_dir,
  'skip|s=s'                  => \@skip,
  'sigle|sg=s'                => \@sigle,
  'cache|c=s'                 => \$cache_file,
  'config|cfg=s'              => \$cfg_file,
  'log|l=s'                   => \$log_level,
  'anno|a=s'                  => \@anno,
  'primary|p!'                => \$primary,
  'pretty|y'                  => \$pretty,
  'jobs|j=i'                  => \$jobs,
  'to-tar'                    => \$to_tar,
  'sequential-extraction|se'  => \$sequential_extraction,
  'cache-size|cs=s'           => \$cache_size,
  'cache-delete|cd!'          => \$cache_delete,
  'cache-init|ci!'            => \$cache_init,

  # Print the relevant sections of the documentation
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
198
Akron63f20d42017-04-10 23:40:29 +0200199
# Load from configuration
# Values from the configuration file are only used for options
# that were not set on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: configuration key => variable to set
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # The command line wins
    next if defined $$target;
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # Multi value options are separated by semicolons
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # The command line wins
    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
311
Akron63f20d42017-04-10 23:40:29 +0200312
# Set defaults for all options that are still undefined
$token_base            //= 'OpenNLP#tokens';
$cache_file            //= 'korapxml2krill.cache';
$cache_size            //= '50m';
$jobs                  //= 0;
$cache_delete          //= 1;
$cache_init            //= 1;
$sequential_extraction //= 0;
$log_level             //= 'ERROR';

# The base annotations default to the empty string
# and are normalized to lower case
$_ = lc($_ // '') foreach (
  $base_sentences,
  $base_paragraphs,
  $base_pagebreaks
);
Akron3741f8b2016-12-21 19:55:21 +0100329
Akron63f20d42017-04-10 23:40:29 +0200330
# Initialize log4perl object
# All messages of the chosen level are written in color to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file passed
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
343
344
# Parameters for pod2usage to print the usage information
# and leave with exit code 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000358
Akronc11f7982017-02-21 21:20:14 +0100359
# A job count of -1 means: use five jobs per available CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs on $cores cores");
};
365
Akron821db3d2017-04-06 21:19:31 +0200366
# Start serial processing
# Every input is converted by a separate 'archive' run of this script.
# Each run writes to its own target below the output, named after the input.
if ($cmd && $cmd eq 'serial') {

  # Without an output the target would be created below the file system root
  unless (defined $output && length $output) {
    $log->error('Serial processing requires an output.');
    exit 1;
  };

  if (!defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set separately for every run
  my @pass_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag. Getopt::Long accepts these case insensitive,
    # with one or two dashes, and with the value attached by '='
    if ($arg =~ /\A--?(?:i|input|o|output)(=.*)?\z/is) {

      # A detached value follows as the next argument
      $remove_next = defined $1 ? 0 : 1;
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @pass_argv, $arg;
  };

  # Iterate over all inputs
  foreach my $pattern (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($pattern));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @pass_argv, '-i', $pattern, '-o', $new_out
    );
    print "Start serial processing of $pattern to $new_out\n";

    # Start archiving. A failing run is reported, but the remaining
    # inputs are still processed and the exit code stays unchanged
    my $status = system @archive_cmd;
    if ($status != 0) {
      $log->error(
        "Serial processing of $pattern failed (wait status $status)"
      );
    };
  };

  exit;
};
421
# Foundries and layers to skip (compared in lower case)
my %skip = map { lc($_) => 1 } @skip;

# All supported annotation layers as [Foundry, Layer] pairs
my @layers;

# Base annotations are only added, if no other base is chosen
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

push(
  @layers,

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo
# Collect all base annotations that are taken from the DeReKo structure
my @dereko_attr = ();
foreach my $base (
  [ 'sentences'  => $base_sentences ],
  [ 'paragraphs' => $base_paragraphs ],
  [ 'pagebreaks' => $base_pagebreaks ]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# The bases are passed as a third value, e.g. 'base-sentences-paragraphs'
push(
  @layers,
  $dereko_attr[0] ?
    ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
    ['DeReKo', 'Structure']
);

push(
  @layers,

  # Glemm
  ['Glemm', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
503
Akron4fa37c32017-01-20 14:43:10 +0100504
# Check filters
my @filtered_anno;

# Everything is skipped - only add the annotations explicitly requested
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
523
# Get tokenization basis
# The token base has the form 'Foundry#Layer'.
# The variables are declared unconditionally, as the behaviour of
# a declaration with a statement modifier ('my ... if ...') is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension
# The layer is undefined, if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200529
# TODO: This should not be initialized for batch
# Cache shared via the cache file
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object
# The tokenization is taken from the token base foundry and layer
my @batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
550
# Get file name based on path information
#
# Expects the path of a text and returns a flat name for it:
# A leading temporary directory and the path of the first input are
# removed, slashes are turned into dashes, and - if the name consists
# of more than three parts separated by dashes - leading parts are dropped.
sub get_file_name ($) {
  my $file = shift;

  # The name is relative to the first input. In case this is
  # a directory not ending with a slash, its last segment is removed
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is quoted, so characters
  # like '.', '+' or '(' are not treated as part of the pattern
  $file =~ s/^?\/?\Q$i\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
567
Akron63f20d42017-04-10 23:40:29 +0200568
# Turn a file name or glob pattern into a flat name without wildcards,
# path separators and the '.zip' extension, e.g.
# 'archives/wpd*.zip' becomes 'archives-wpd'.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # All substitutions operate on the copy of the pattern
  for ($name) {

    # Transform paths
    s![\\\/]!-!g;

    # Remove arbitrary fills
    s/[\*\?]//g;

    # Remove class and multiple brackets
    s/[\{\}\[\]]/-/g;

    # Remove sequences of binding characters
    s/\-\-+/-/g;

    # Clean beginning and end
    s/^-//;
    s/-$//;

    # Remove file extension
    s/\.zip$//;
  };

  return $name;
};
580
581
# Convert sigle to path construct
# Surrounding whitespace is removed and a sigle of the form
# 'A_B.C' is rewritten in place to 'A/B/C'
foreach my $entry (@sigle) {
  $entry =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
584
# With a command given, the output has to be an existing
# directory, unless the result is written to a tar file
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
591
Akron63f20d42017-04-10 23:40:29 +0200592
# Glob and prefix files
# Every input may contain wildcards and is resolved
# relative to the input base, if given.
if (@input) {

  # Prefix with input root and expand all wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Wildcards without any match would leave no input to process
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
612
613
# Process a single file
# Without a command, only the first input is converted
if (!$cmd) {
  my $input = $input[0];

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $step    = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $step . ' (overall: ' . $overall . ')'
    );

    # The next measurement starts here
    $main::LAST_STOP = $now;
  };

  # Create and parse new document
  # The path is expected to end with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
645
Nils Diewald59094f22014-11-05 18:20:50 +0000646
Akrone10ad322016-02-27 10:54:26 +0100647# Extract XML files
Akron81500102017-04-07 20:45:44 +0200648if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100649
Akrond5643ad2017-07-04 20:27:13 +0200650 # Output is required
651 pod2usage(%ERROR_HASH) unless $output;
652
Akron7d4cdd82016-08-17 21:39:45 +0200653 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200654 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100655
Akron7d4cdd82016-08-17 21:39:45 +0200656 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100657 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200658 $log->error("Unzip is not installed or incompatible.");
659 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100660 };
661
Akronb0c88db2016-06-29 16:33:18 +0200662 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200663 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200664
Akron651cb8d2016-08-16 21:44:49 +0200665 my $prefix = 1;
666
Akron03b24db2016-08-16 20:54:32 +0200667 # No sigles given
668 unless (@sigle) {
669
670 # Get files
671 foreach ($archive->list_texts) {
672
673 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200674 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200675
676 # TODO: Make this OS independent
677 push @sigle, join '/', $corpus, $doc, $text;
678 };
Akron20807582016-10-26 17:11:34 +0200679 }
680
681 # Check sigle for doc sigles
682 else {
683 my @new_sigle;
684
685 my $prefix_check = 0;
686
687 # Iterate over all sigle
688 foreach (@sigle) {
689
690 # Sigle is a doc sigle
691 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200692
Akron60a8caa2017-02-17 21:51:27 +0100693 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200694 # Check if a prefix is needed
695 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100696
697 if ($prefix = $archive->check_prefix) {
698 print " with prefix ...";
699 };
Akron20807582016-10-26 17:11:34 +0200700 $prefix_check = 1;
701 };
702
Akron60a8caa2017-02-17 21:51:27 +0100703 print "\n";
704
Akron20807582016-10-26 17:11:34 +0200705 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200706 my $path = ($prefix ? './' : '') . $_;
707
708 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200709 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200710 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200711 ) ? '' : 'not '
712 );
713 print "extracted.\n";
714 }
Akron60a8caa2017-02-17 21:51:27 +0100715
716 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200717 else {
718 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100719
720 unless ($prefix_check) {
721
722 if ($prefix = $archive->check_prefix) {
723 print " with prefix ...";
724 };
725 $prefix_check = 1;
726 };
Akron20807582016-10-26 17:11:34 +0200727 };
728 };
729 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200730 };
731
Akrone10ad322016-02-27 10:54:26 +0100732 # Iterate over all given sigles and extract
733 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100734
Akron2812ba22016-10-28 21:55:59 +0200735 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200736
Akron03b24db2016-08-16 20:54:32 +0200737 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200738 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100739
Akron20807582016-10-26 17:11:34 +0200740 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200741 ($prefix ? './' : '') . $_, $output
742 ) ? '' : 'not '
743 );
Akrone10ad322016-02-27 10:54:26 +0100744 print "extracted.\n";
745 };
Akronb0c88db2016-06-29 16:33:18 +0200746 }
Akron7d4cdd82016-08-17 21:39:45 +0200747
748 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200749 else {
750 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200751 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100752 };
753}
754
Akron81500102017-04-07 20:45:44 +0200755
Akron941c1a62016-02-23 17:41:41 +0100756# Process an archive
757elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000758
Akron81500102017-04-07 20:45:44 +0200759 my $archive_output;
760
761 # First extract, then archive
762 if (defined $extract_dir) {
763
764 # Create new archive object
765 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
766
767 # Check zip capabilities
768 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200769 $log->error("Unzip is not installed or incompatible.");
770 exit 1;
Akron81500102017-04-07 20:45:44 +0200771 };
772
773 # Add further annotation archived
774 $archive->attach($_) foreach @input[1..$#input];
775
776 # Create a temporary directory
777 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200778 $extract_dir = tempdir(CLEANUP => 0);
779 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200780 };
781
Akron63f20d42017-04-10 23:40:29 +0200782 # Add some random extra to avoid clashes with multiple archives
783 $extract_dir = catdir($extract_dir, random_string('cccccc'));
784
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200786 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200787 @input = ($extract_dir);
788 }
789 else {
790 $log->error('Unable to extract from primary archive ' . $input[0] .
791 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200792 exit 1;
Akron81500102017-04-07 20:45:44 +0200793 };
794 }
795
796 # Can't create archive object
797 else {
798 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200799 exit 1;
Akron81500102017-04-07 20:45:44 +0200800 };
801 };
802
Akrone1dbc382016-07-08 22:24:52 +0200803 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100804
Akron7d4cdd82016-08-17 21:39:45 +0200805 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100806 my $pool = Parallel::ForkManager->new($jobs);
807
Akron7d4cdd82016-08-17 21:39:45 +0200808 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100809 my $iter = 1; # Current text in process
810
Akronda3097e2017-04-23 19:53:57 +0200811 my $tar_archive;
812 my $output_dir = $output;
813 my $tar_fh;
814
815 # Initialize tar archive
816 if ($to_tar) {
817 $tar_archive = Archive::Tar::Builder->new(
818 ignore_errors => 1
819 );
820
821 # Set output name
822 my $tar_file = $output;
823 unless ($tar_file =~ /\.tar$/) {
824 $tar_file .= '.tar';
825 };
826
827 # Initiate the tar file
828 print "Writing to file $tar_file\n";
829 $tar_fh = IO::File->new($tar_file, 'w');
830 $tar_fh->binmode(1);
831
832 # Set handle
833 $tar_archive->set_handle($tar_fh);
834
835 # Output to temporary directory
836 $output_dir = File::Temp->newdir;
837 };
838
Akron941c1a62016-02-23 17:41:41 +0100839 # Report on fork message
840 $pool->run_on_finish (
841 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200842 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100843 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200844
Akron08385f62016-03-22 20:37:04 +0100845 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200846 ($iter++) . "/$count]" .
847 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200848 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200849
850 if (!$code && $to_tar && $data->[2]) {
851 my $filename = $data->[2];
852
853 # Lock filehandle
854 if (flock($tar_fh, LOCK_EX)) {
855
Akron9a062ce2017-07-04 19:12:05 +0200856 my $clean_file = fileparse($filename);
857
Akronda3097e2017-04-23 19:53:57 +0200858 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200859 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200860 unlink $filename;
861
862 # Unlock filehandle
863 flock($tar_fh, LOCK_UN);
864 }
865 else {
866 $log->warn("Unable to add $filename to archive");
867 };
868 };
869
Akron4c0cf312016-10-15 16:42:09 +0200870 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100871 }
872 );
873
874 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200875 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100876 print "Reading data ...\n";
877
Akron7d4cdd82016-08-17 21:39:45 +0200878 # unless (Cache::FastMmap->new(
879 # share_file => $cache_file,
880 # cache_size => $cache_size,
881 # init_file => $cache_init
882 # )) {
883 # print "Unable to intialize cache '$cache_file'\n\n";
884 # exit(1);
885 # };
Akron11c80302016-03-18 19:44:43 +0100886
Akron486f9ab2017-04-22 23:25:19 +0200887
Akron941c1a62016-02-23 17:41:41 +0100888 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100889 if (-d $input[0]) {
890 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100891 my @dirs;
892 my $dir;
893
Akron7d4cdd82016-08-17 21:39:45 +0200894 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100895 while (1) {
896 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200897 push @dirs, $dir;
898 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100899 };
900 last unless $it->next;
901 };
902
903 print "Start processing ...\n";
904 $t = Benchmark->new;
905 $count = scalar @dirs;
906
907 DIRECTORY_LOOP:
908 for (my $i = 0; $i < $count; $i++) {
909
Akrone1dbc382016-07-08 22:24:52 +0200910 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200911 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200912 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200913 );
Akron941c1a62016-02-23 17:41:41 +0100914
915 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200916 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200917
Akron13d56622016-10-31 14:54:49 +0100918 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200919 $pool->finish(
920 0,
Akronda3097e2017-04-23 19:53:57 +0200921 [
922 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
923 undef,
924 $filename
925 ]
Akron486f9ab2017-04-22 23:25:19 +0200926 );
Akron3ec48972016-08-17 23:24:52 +0200927 }
928 else {
Akron4c0cf312016-10-15 16:42:09 +0200929 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200930 };
Akron941c1a62016-02-23 17:41:41 +0100931 };
932 }
933
934 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200935 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200936
Akron941c1a62016-02-23 17:41:41 +0100937 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200938 $log->error("Unzip is not installed or incompatible.");
939 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100940 };
941
Akron08385f62016-03-22 20:37:04 +0100942 # Add further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200943 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100944
Akron941c1a62016-02-23 17:41:41 +0100945 print "Start processing ...\n";
946 $t = Benchmark->new;
947 my @dirs = $archive->list_texts;
948 $count = scalar @dirs;
949
950 ARCHIVE_LOOP:
951 for (my $i = 0; $i < $count; $i++) {
952
953 # Split path information
954 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
955
Akrone1dbc382016-07-08 22:24:52 +0200956 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200957 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200958 get_file_name(
959 catfile($corpus, $doc, $text)
960 . '.json' . ($gzip ? '.gz' : '')
961 )
Akrone1dbc382016-07-08 22:24:52 +0200962 );
Akron941c1a62016-02-23 17:41:41 +0100963
964 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200965 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100966
Akron4c0cf312016-10-15 16:42:09 +0200967 # Create temporary directory
968 $temp = File::Temp->newdir;
969
Akronbdf434a2016-10-24 17:42:07 +0200970 # TODO: Check if $filename exists at the beginning,
971 # because extraction can be horribly slow!
972
Akron941c1a62016-02-23 17:41:41 +0100973 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200974 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100975
Akron7d4cdd82016-08-17 21:39:45 +0200976 # Create corpus directory
977 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100978
Akron7d4cdd82016-08-17 21:39:45 +0200979 # Temporary directory
980 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100981
Akron7d4cdd82016-08-17 21:39:45 +0200982 # Write file
Akron13d56622016-10-31 14:54:49 +0100983 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200984
Akron4c0cf312016-10-15 16:42:09 +0200985 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100986 $pool->finish(
987 0,
Akronda3097e2017-04-23 19:53:57 +0200988 [
989 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
990 $temp,
991 $filename
992 ]
Akron13d56622016-10-31 14:54:49 +0100993 );
994 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200995 }
996 else {
Akron4c0cf312016-10-15 16:42:09 +0200997 # Delete temporary file
998 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200999 };
Akron941c1a62016-02-23 17:41:41 +01001000 }
Akron7d4cdd82016-08-17 21:39:45 +02001001
1002 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001003 else {
Akron4c0cf312016-10-15 16:42:09 +02001004 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001005 };
1006 };
1007 }
1008
1009 else {
1010 print "Input is neither a directory nor an archive.\n\n";
1011 };
1012
1013 $pool->wait_all_children;
1014
Akron11c80302016-03-18 19:44:43 +01001015 # Delete cache file
1016 unlink($cache_file) if $cache_delete;
1017
Akronda3097e2017-04-23 19:53:57 +02001018 # Close tar filehandle
1019 if ($to_tar && $tar_fh) {
1020 $tar_archive->finish;
1021 $tar_fh->close;
1022 print "Wrote to tar archive.\n";
1023 };
1024
Akron63f20d42017-04-10 23:40:29 +02001025 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001026 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001027};
Akron941c1a62016-02-23 17:41:41 +01001028
Nils Diewald2db9ad02013-10-29 19:26:43 +00001029
Akron63f20d42017-04-10 23:40:29 +02001030# Clean up the temporary extraction directory, if one was set
1031if ($extract_dir) {
1032 my $objects = remove_tree($extract_dir, { safe => 1 }); # safe => 1: per File::Path, entries that would need a permission change to delete are skipped
1033 print "Removed directory $extract_dir with $objects objects.\n"; # $objects is the value returned by remove_tree
1034};
1035
1036
1037print "\n";
1038
Nils Diewald2db9ad02013-10-29 19:26:43 +00001039__END__
Akron941c1a62016-02-23 17:41:41 +01001040
1041=pod
1042
1043=encoding utf8
1044
1045=head1 NAME
1046
Akronf7ad89e2016-03-16 18:22:47 +01001047korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001048
1049
1050=head1 SYNOPSIS
1051
Akrona76d8352016-10-27 16:27:32 +02001052 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001053
Akron2fd402b2016-10-27 21:26:48 +02001054
Akron941c1a62016-02-23 17:41:41 +01001055=head1 DESCRIPTION
1056
1057L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1058compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001059The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001060
1061
1062=head1 INSTALLATION
1063
1064The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1065
Akronaf386982016-10-12 00:33:25 +02001066 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001067
Akronc13a1702016-03-15 19:33:14 +01001068In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001069be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001070Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +02001071In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001072
1073=head1 ARGUMENTS
1074
Akrona76d8352016-10-27 16:27:32 +02001075 $ korapxml2krill -z --input <directory> --output <filename>
1076
1077Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001078It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001079
Akron941c1a62016-02-23 17:41:41 +01001080=over 2
1081
1082=item B<archive>
1083
Akron081639e2017-04-21 19:01:39 +02001084 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001085
Akron2fd402b2016-10-27 21:26:48 +02001086Converts an archive of KorAP-XML documents. It expects a directory
1087(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001088
1089=item B<extract>
1090
Akrona76d8352016-10-27 16:27:32 +02001091 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1092
1093Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001094
Akron63f20d42017-04-10 23:40:29 +02001095=item B<serial>
1096
1097 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1098
1099Convert archives sequentially. The inputs are not merged but treated
1100as they are (so they may be premerged or globs).
1101The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001102are created based on the archive name. In case the C<--to-tar> flag is given,
1103the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001104
1105
Akron941c1a62016-02-23 17:41:41 +01001106=back
1107
1108
1109=head1 OPTIONS
1110
1111=over 2
1112
Akrona76d8352016-10-27 16:27:32 +02001113=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001114
Akrona76d8352016-10-27 16:27:32 +02001115Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001116
Akron7606afa2016-10-25 16:23:49 +02001117Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001118document, while C<archive> expects a KorAP-XML corpus folder or a zip
1119file to batch process multiple files.
1120C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001121
Akrona76d8352016-10-27 16:27:32 +02001122C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001123that the first archive listed contains all primary data files
1124and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001125
Akron7606afa2016-10-25 16:23:49 +02001126 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001127
Akron821db3d2017-04-06 21:19:31 +02001128Input may also be defined using BSD glob wildcards.
1129
1130 -i 'file/news*.zip'
1131
1132The extended input array will be sorted in length order, so the shortest
1133path needs to contain all primary data files and all meta data files.
1134
Akron0c3e3752016-06-28 15:55:53 +02001135(The directory structure follows the base directory format,
1136that may include a C<.> root folder.
1137In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001138need to be passed with a hash sign in front of the archive's name.
1139This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001140
Akron7606afa2016-10-25 16:23:49 +02001141To support zip files, a version of C<unzip> needs to be installed that is
1142compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001143
Akron7606afa2016-10-25 16:23:49 +02001144B<The root folder switch using the hash sign is experimental and
1145may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001146
Akronf73ffb62018-06-27 12:13:59 +02001147
Akron63f20d42017-04-10 23:40:29 +02001148=item B<--input-base|-ib> <directory>
1149
1150The base directory for inputs.
1151
1152
Akron941c1a62016-02-23 17:41:41 +01001153=item B<--output|-o> <directory|file>
1154
1155Output folder for archive processing or
1156document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001157writes to C<STDOUT> by default
1158(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001159
1160=item B<--overwrite|-w>
1161
1162Overwrite files that already exist.
1163
Akronf73ffb62018-06-27 12:13:59 +02001164
Akron3741f8b2016-12-21 19:55:21 +01001165=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001166
1167Define the default tokenization by specifying
1168the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001169of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001170
Akron3741f8b2016-12-21 19:55:21 +01001171
1172=item B<--base-sentences|-bs> <foundry>#<layer>
1173
1174Define the layer for base sentences.
1175If given, this will be used instead of using C<Base#Sentences>.
1176Currently C<DeReKo#Structure> is the only additional layer supported.
1177
1178 Defaults to unset.
1179
1180
1181=item B<--base-paragraphs|-bp> <foundry>#<layer>
1182
1183Define the layer for base paragraphs.
1184If given, this will be used instead of using C<Base#Paragraphs>.
1185Currently C<DeReKo#Structure> is the only additional layer supported.
1186
1187 Defaults to unset.
1188
1189
Akron41ac10b2017-02-08 22:47:25 +01001190=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1191
1192Define the layer for base pagebreaks.
1193Currently C<DeReKo#Structure> is the only layer supported.
1194
1195 Defaults to unset.
1196
1197
Akron941c1a62016-02-23 17:41:41 +01001198=item B<--skip|-s> <foundry>[#<layer>]
1199
Akronf7ad89e2016-03-16 18:22:47 +01001200Skip specific annotations by specifying the foundry
1201(and optionally the layer with a C<#>-prefix),
1202e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001203Can be set multiple times.
1204
Akronf73ffb62018-06-27 12:13:59 +02001205
Akronc13a1702016-03-15 19:33:14 +01001206=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001207
Akronf7ad89e2016-03-16 18:22:47 +01001208Convert specific annotations by specifying the foundry
1209(and optionally the layer with a C<#>-prefix),
1210e.g. C<Mate> or C<Mate#Morpho>.
1211Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001212
Akronf73ffb62018-06-27 12:13:59 +02001213
Akron941c1a62016-02-23 17:41:41 +01001214=item B<--primary|-p>
1215
Akronc13a1702016-03-15 19:33:14 +01001216Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001217Can be flagged using C<--no-primary> as well.
1218This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001219
Akronf73ffb62018-06-27 12:13:59 +02001220
Akron941c1a62016-02-23 17:41:41 +01001221=item B<--jobs|-j>
1222
1223Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001224for archive processing.
Akron11c80302016-03-18 19:44:43 +01001225Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001226
1227If C<sequential-extraction> is not set to false, this will
1228also apply to extraction.
1229
Akronc11f7982017-02-21 21:20:14 +01001230Pass -1, and the value will be set automatically to 5
1231times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001232This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001233
Akronf73ffb62018-06-27 12:13:59 +02001234
Akron9ec88872017-04-12 16:29:06 +02001235=item B<--sequential-extraction|-se>
1236
1237Flag to indicate whether the C<jobs> value also applies to extraction.
1238Some systems may have problems with extracting multiple archives
1239to the same folder at the same time.
1240Can be flagged using C<--no-sequential-extraction> as well.
1241Defaults to C<false>.
1242
Akronf73ffb62018-06-27 12:13:59 +02001243
Akron35db6e32016-03-17 22:42:22 +01001244=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001245
Akron35db6e32016-03-17 22:42:22 +01001246Define the metadata parser to use. Defaults to C<I5>.
1247Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1248This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001249
Akronf73ffb62018-06-27 12:13:59 +02001250
Akron941c1a62016-02-23 17:41:41 +01001251=item B<--pretty|-y>
1252
Akronc13a1702016-03-15 19:33:14 +01001253Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001254This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001255
Akronf73ffb62018-06-27 12:13:59 +02001256
Akron941c1a62016-02-23 17:41:41 +01001257=item B<--gzip|-z>
1258
Akronf7ad89e2016-03-16 18:22:47 +01001259Compress the output.
1260Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001261
Akronf73ffb62018-06-27 12:13:59 +02001262
Akron11c80302016-03-18 19:44:43 +01001263=item B<--cache|-c>
1264
1265File to mmap a cache (using L<Cache::FastMmap>).
1266Defaults to C<korapxml2krill.cache> in the calling directory.
1267
Akronf73ffb62018-06-27 12:13:59 +02001268
Akron11c80302016-03-18 19:44:43 +01001269=item B<--cache-size|-cs>
1270
1271Size of the cache. Defaults to C<50m>.
1272
Akronf73ffb62018-06-27 12:13:59 +02001273
Akron11c80302016-03-18 19:44:43 +01001274=item B<--cache-init|-ci>
1275
1276Initialize cache file.
1277Can be flagged using C<--no-cache-init> as well.
1278Defaults to C<true>.
1279
Akronf73ffb62018-06-27 12:13:59 +02001280
Akron11c80302016-03-18 19:44:43 +01001281=item B<--cache-delete|-cd>
1282
1283Delete cache file after processing.
1284Can be flagged using C<--no-cache-delete> as well.
1285Defaults to C<true>.
1286
Akronf73ffb62018-06-27 12:13:59 +02001287
Akron636aa112017-04-07 18:48:56 +02001288=item B<--config|-cfg>
1289
1290Configure the parameters of your call in a file
1291of key-value pairs with whitespace separator
1292
1293 overwrite 1
1294 token DeReKo#Structure
1295 ...
1296
1297Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001298C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001299C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001300C<output>,
1301C<temp-extract>, C<sequential-extraction>,
1302C<base-sentences>, C<base-paragraphs>,
1303C<base-pagebreaks>,
1304C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001305(semicolon separated), C<anno> (semicolon separated).
1306
Akronf73ffb62018-06-27 12:13:59 +02001307Configuration parameters will always be overwritten by
1308passed parameters.
1309
1310
Akron81500102017-04-07 20:45:44 +02001311=item B<--temporary-extract|-te>
1312
1313Only valid for the C<archive> command.
1314
1315This will first extract all files into a
1316directory and then will archive.
1317If the directory is given as C<:temp:>,
1318a temporary directory is used.
1319This is especially useful to avoid
1320massive unzipping and potential
1321network latency.
Akron636aa112017-04-07 18:48:56 +02001322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akrone10ad322016-02-27 10:54:26 +01001324=item B<--sigle|-sg>
1325
Akron20807582016-10-26 17:11:34 +02001326Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001327Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001328I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001329Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001330In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001331On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001332
Akronf73ffb62018-06-27 12:13:59 +02001333
Akron941c1a62016-02-23 17:41:41 +01001334=item B<--log|-l>
1335
1336The L<Log::Log4perl> log level, defaults to C<ERROR>.
1337
Akronf73ffb62018-06-27 12:13:59 +02001338
Akron941c1a62016-02-23 17:41:41 +01001339=item B<--help|-h>
1340
1341Print this document.
1342
Akronf73ffb62018-06-27 12:13:59 +02001343
Akron941c1a62016-02-23 17:41:41 +01001344=item B<--version|-v>
1345
1346Print version information.
1347
1348=back
1349
Akronf73ffb62018-06-27 12:13:59 +02001350
Akronc13a1702016-03-15 19:33:14 +01001351=head1 ANNOTATION SUPPORT
1352
1353L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1354developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1355The base foundry with paragraphs, sentences, and the text element is mandatory for
1356L<Krill|https://github.com/KorAP/Krill>.
1357
Akron821db3d2017-04-06 21:19:31 +02001358 Base
1359 #Paragraphs
1360 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001361
Akron821db3d2017-04-06 21:19:31 +02001362 Connexor
1363 #Morpho
1364 #Phrase
1365 #Sentences
1366 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001367
Akron821db3d2017-04-06 21:19:31 +02001368 CoreNLP
1369 #Constituency
1370 #Morpho
1371 #NamedEntities
1372 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001373
Akronce125b62017-06-19 11:54:36 +02001374 CMC
1375 #Morpho
1376
Akron821db3d2017-04-06 21:19:31 +02001377 DeReKo
1378 #Structure
Akronc13a1702016-03-15 19:33:14 +01001379
Akron821db3d2017-04-06 21:19:31 +02001380 DRuKoLa
1381 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001382
Akron821db3d2017-04-06 21:19:31 +02001383 Glemm
1384 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001385
Akron4c679192018-01-16 17:41:49 +01001386 LWC
1387 #Dependency
1388
Akron821db3d2017-04-06 21:19:31 +02001389 Malt
1390 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001391
Akron821db3d2017-04-06 21:19:31 +02001392 MarMoT
1393 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001394
Akron821db3d2017-04-06 21:19:31 +02001395 Mate
1396 #Dependency
1397 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001398
Akron821db3d2017-04-06 21:19:31 +02001399 MDParser
1400 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001401
Akron821db3d2017-04-06 21:19:31 +02001402 OpenNLP
1403 #Morpho
1404 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001405
Akron821db3d2017-04-06 21:19:31 +02001406 Sgbr
1407 #Lemma
1408 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001409
Akron821db3d2017-04-06 21:19:31 +02001410 TreeTagger
1411 #Morpho
1412 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001413
Akron821db3d2017-04-06 21:19:31 +02001414 XIP
1415 #Constituency
1416 #Morpho
1417 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001418
Akronc13a1702016-03-15 19:33:14 +01001419
1420More importers are in preparation.
1421New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1422See the built-in annotation importers as examples.
1423
Akronf73ffb62018-06-27 12:13:59 +02001424
Akron941c1a62016-02-23 17:41:41 +01001425=head1 AVAILABILITY
1426
1427 https://github.com/KorAP/KorAP-XML-Krill
1428
1429
1430=head1 COPYRIGHT AND LICENSE
1431
Akron4c679192018-01-16 17:41:49 +01001432Copyright (C) 2015-2018, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001433
Akron941c1a62016-02-23 17:41:41 +01001434Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001435
Akrona76d8352016-10-27 16:27:32 +02001436Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001437
1438L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1439Corpus Analysis Platform at the
1440L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1441member of the
1442L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1443
1444This program is free software published under the
1445L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1446
1447=cut