blob: 7105233f0cc240955dcd4be9fb2916f86eb7e67a [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akron941c1a62016-02-23 17:41:41 +0100132# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100133
Akron5fdc7e12018-07-19 12:37:48 +0200134our $LAST_CHANGE = '2018/07/19';
Akron941c1a62016-02-23 17:41:41 +0100135our $LOCAL = $FindBin::Bin;
136our $VERSION_MSG = <<"VERSION";
137Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
138VERSION
139
Akron63f20d42017-04-10 23:40:29 +0200140# Prototypes
141sub get_file_name_from_glob($);
142sub get_file_name($);
143
# Parse command:
# The first argument is taken as the command,
# unless it is missing or starts with a dash (i.e. looks like an option)
our @ARGV;
my $cmd;
unless (!$ARGV[0] || index($ARGV[0], '-') == 0) {
  $cmd = shift @ARGV;
};

# Keep a copy of the remaining arguments
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100151
# Lists filled by repeatable options
my (@skip, @sigle, @anno, @input);
my $text;

# Values of all single-value options and switches
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'to-tar'                   => \$to_tar,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Print the usage sections of the POD together with the version
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
198
Akron63f20d42017-04-10 23:40:29 +0200199
# Load from configuration
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value options:
  # A configured value is only taken over if the option
  # is still undefined after parsing the command line
  my %scalar_for = (
    'overwrite'             => \$overwrite,
    'gzip'                  => \$gzip,
    'jobs'                  => \$jobs,
    'input-base'            => \$input_base,
    'temporary-extract'     => \$extract_dir,
    'token'                 => \$token_base,
    'cache'                 => \$cache_file,
    'cache-size'            => \$cache_size,
    'cache-delete'          => \$cache_delete,
    'cache-init'            => \$cache_init,
    'sequential-extraction' => \$sequential_extraction,
    'meta'                  => \$meta,
    'output'                => \$output,
    'base-sentences'        => \$base_sentences,
    'base-paragraphs'       => \$base_paragraphs,
    'base-pagebreaks'       => \$base_pagebreaks,
    'to-tar'                => \$to_tar,
    'log'                   => \$log_level
  );

  foreach my $key (keys %scalar_for) {
    my $option = $scalar_for{$key};
    next if defined ${$option};
    next unless defined $config{$key};
    ${$option} = $config{$key};
  };

  # List options:
  # A configured value (separated by semicolons) is only taken over
  # if the list is still empty after parsing the command line
  my %list_for = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno
  );

  foreach my $key (keys %list_for) {
    my $option = $list_for{$key};
    next if scalar(@{$option});
    next unless defined $config{$key};
    @{$option} = split /\s*;\s*/, $config{$key};
  };
};
311
Akron63f20d42017-04-10 23:40:29 +0200312
# Set defaults for all options that are still undefined
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;

# The base layers default to the empty string
# and are normalized to lower case
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');
Akron3741f8b2016-12-21 19:55:21 +0100329
Akron63f20d42017-04-10 23:40:29 +0200330
# Initialize log4perl object
# (all messages of the chosen level are written colored to STDERR)
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


# Report the configuration file only if it exists.
# A non-existing file is ignored when the configuration is loaded,
# so announcing it as being read would be misleading.
if ($cfg_file) {
  if (-e $cfg_file) {
    print "Reading config from $cfg_file\n";
  }
  else {
    $log->warn("Config file '$cfg_file' does not exist and is ignored");
  };
};
343
344
# Parameters for pod2usage in case of an invalid call:
# print the usage sections and exit with code 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means:
# calculate the number of jobs based on the number of CPU cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
365
Akron821db3d2017-04-06 21:19:31 +0200366
# Start serial processing:
# Every input is processed by a separate "archive" call of this script
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output parameters from the kept arguments,
  # as they are set separately for every archive call
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Input or output flag with an attached value (e.g. --input=file.zip),
    # which is a form Getopt::Long accepts as well
    elsif ($arg =~ /^--?(?:input|i|output|o)=/) {
      next;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;


  # Number of archive calls that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving.
    # A failing call is reported, but the remaining inputs are still processed
    if (system(@archive_cmd) != 0) {
      $log->error("Processing of $serial_input failed (status $?)");
      $failed++;
    };
  };

  # Signal failure to the caller, if any archive call failed
  exit($failed ? 1 : 0);
};
421
# Lookup table of everything that should be skipped (in lower case)
my %skip = map { lc($_) => 1 } @skip;

# All supported annotation layers as [Foundry, Layer] pairs.
# The order of the list is kept for the filtered annotations.
my @layers;

# The base layers are only added, if no other base is given
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

push @layers, (

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo:
# Collect all base structures that are taken from the DeReKo structure
my %base_for = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $base_for{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

push @layers, (
  scalar(@dereko_attr) ?
    ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
    ['DeReKo', 'Structure']
);

push @layers, (

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);


# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly given annotations are added
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# neither by Foundry nor by Foundry#Layer
else {
  @filtered_anno = grep {
    !(
      $skip{lc($_->[0])} ||
        $skip{lc($_->[0]) . '#' . lc($_->[1])}
      )
  } @layers;
};
526
# Get tokenization basis (given as Foundry#Layer).
# The variables are declared unconditionally, as the behaviour
# of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension
# (the layer is undefined, if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200532
# TODO: This should not be initialized for batch
# Parameters of the cache, based on the cache options
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object, that receives the cache, the tokenization basis,
# the output options and the filtered annotations
my @batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
553
# Get file name based on path information.
#
# Expects a path and returns a flat name for it:
# A leading temporary directory fragment and the path of the first input
# are removed and all remaining slashes are replaced by dashes.
sub get_file_name ($) {
  my $file = shift;

  # Base path is the first input.
  # If the input is a directory, the last path segment is removed
  # (in case the path does not end with a slash).
  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path.
  # The path is quoted, so that characters like '.', '+' or '('
  # are matched literally instead of being interpreted as a pattern.
  $file =~ s/\/?\Q$base\E//;

  # Replace slashes by dashes and remove leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Remove the first dash separated part,
  # if at least three further parts follow
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
570
Akron63f20d42017-04-10 23:40:29 +0200571
# Turn a glob pattern (or path) into a flat name:
# Path separators and brackets become dashes, wildcards are removed,
# and a trailing ".zip" is stripped as the last step.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # All substitutions are applied to the name in the given order
  for ($name) {
    s![\\\/]!-!g;     # Path separators become dashes
    s/[\*\?]//g;      # Wildcards are dropped
    s/[\{\}\[\]]/-/g; # Brackets of classes and alternatives become dashes
    s/\-\-+/-/g;      # Sequences of dashes are collapsed
    s/^-//;           # No dash at the beginning
    s/-$//;           # No dash at the end
    s/\.zip$//;       # No file extension
  };

  return $name;
};
583
584
# Convert sigle to path construct
# (surrounding whitespace is removed as well)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# With a command given, the output has to be an existing directory.
# This is only checked, if an output is given and to-tar is undefined.
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
594
Akron63f20d42017-04-10 23:40:29 +0200595
# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($pattern));
  };

  # A wildcard without any match expands to nothing.
  # Stop with an error instead of continuing without any input.
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
615
616
# Process a single file (no command was given)
if (!$cmd) {
  my $input = $input[0];

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  # and remember the current time as the last stop
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));
    $log->info(
      join('', 'The code took: ', $since_last, ' (overall: ', $since_start, ')')
    );
    $main::LAST_STOP = $now;
  };

  # Create and parse new document
  # (a slash is appended, if the input does not end with one)
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
648
Nils Diewald59094f22014-11-05 18:20:50 +0000649
Akrone10ad322016-02-27 10:54:26 +0100650# Extract XML files
Akron81500102017-04-07 20:45:44 +0200651if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100652
Akrond5643ad2017-07-04 20:27:13 +0200653 # Output is required
654 pod2usage(%ERROR_HASH) unless $output;
655
Akron7d4cdd82016-08-17 21:39:45 +0200656 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200657 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100658
Akron7d4cdd82016-08-17 21:39:45 +0200659 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100660 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200661 $log->error("Unzip is not installed or incompatible.");
662 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100663 };
664
Akronb0c88db2016-06-29 16:33:18 +0200665 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200666 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200667
Akron651cb8d2016-08-16 21:44:49 +0200668 my $prefix = 1;
669
Akron03b24db2016-08-16 20:54:32 +0200670 # No sigles given
671 unless (@sigle) {
672
673 # Get files
674 foreach ($archive->list_texts) {
675
676 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200677 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200678
679 # TODO: Make this OS independent
680 push @sigle, join '/', $corpus, $doc, $text;
681 };
Akron20807582016-10-26 17:11:34 +0200682 }
683
684 # Check sigle for doc sigles
685 else {
686 my @new_sigle;
687
688 my $prefix_check = 0;
689
690 # Iterate over all sigle
691 foreach (@sigle) {
692
693 # Sigle is a doc sigle
694 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200695
Akron60a8caa2017-02-17 21:51:27 +0100696 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200697 # Check if a prefix is needed
698 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100699
700 if ($prefix = $archive->check_prefix) {
701 print " with prefix ...";
702 };
Akron20807582016-10-26 17:11:34 +0200703 $prefix_check = 1;
704 };
705
Akron60a8caa2017-02-17 21:51:27 +0100706 print "\n";
707
Akron20807582016-10-26 17:11:34 +0200708 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200709 my $path = ($prefix ? './' : '') . $_;
710
711 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200712 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200713 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200714 ) ? '' : 'not '
715 );
716 print "extracted.\n";
717 }
Akron60a8caa2017-02-17 21:51:27 +0100718
719 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200720 else {
721 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100722
723 unless ($prefix_check) {
724
725 if ($prefix = $archive->check_prefix) {
726 print " with prefix ...";
727 };
728 $prefix_check = 1;
729 };
Akron20807582016-10-26 17:11:34 +0200730 };
731 };
732 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200733 };
734
Akrone10ad322016-02-27 10:54:26 +0100735 # Iterate over all given sigles and extract
736 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100737
Akron2812ba22016-10-28 21:55:59 +0200738 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200739
Akron03b24db2016-08-16 20:54:32 +0200740 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200741 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100742
Akron20807582016-10-26 17:11:34 +0200743 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200744 ($prefix ? './' : '') . $_, $output
745 ) ? '' : 'not '
746 );
Akrone10ad322016-02-27 10:54:26 +0100747 print "extracted.\n";
748 };
Akronb0c88db2016-06-29 16:33:18 +0200749 }
Akron7d4cdd82016-08-17 21:39:45 +0200750
751 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200752 else {
753 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200754 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100755 };
756}
757
Akron81500102017-04-07 20:45:44 +0200758
Akron941c1a62016-02-23 17:41:41 +0100759# Process an archive
760elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000761
Akron81500102017-04-07 20:45:44 +0200762 my $archive_output;
763
764 # First extract, then archive
765 if (defined $extract_dir) {
766
767 # Create new archive object
768 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
769
770 # Check zip capabilities
771 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200772 $log->error("Unzip is not installed or incompatible.");
773 exit 1;
Akron81500102017-04-07 20:45:44 +0200774 };
775
776 # Add further annotation archived
777 $archive->attach($_) foreach @input[1..$#input];
778
779 # Create a temporary directory
780 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200781 $extract_dir = tempdir(CLEANUP => 0);
782 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200783 };
784
Akron63f20d42017-04-10 23:40:29 +0200785 # Add some random extra to avoid clashes with multiple archives
786 $extract_dir = catdir($extract_dir, random_string('cccccc'));
787
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200789 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200790 @input = ($extract_dir);
791 }
792 else {
793 $log->error('Unable to extract from primary archive ' . $input[0] .
794 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200795 exit 1;
Akron81500102017-04-07 20:45:44 +0200796 };
797 }
798
799 # Can't create archive object
800 else {
801 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200802 exit 1;
Akron81500102017-04-07 20:45:44 +0200803 };
804 };
805
Akrone1dbc382016-07-08 22:24:52 +0200806 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100807
Akron7d4cdd82016-08-17 21:39:45 +0200808 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100809 my $pool = Parallel::ForkManager->new($jobs);
810
Akron7d4cdd82016-08-17 21:39:45 +0200811 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100812 my $iter = 1; # Current text in process
813
Akronda3097e2017-04-23 19:53:57 +0200814 my $tar_archive;
815 my $output_dir = $output;
816 my $tar_fh;
817
818 # Initialize tar archive
819 if ($to_tar) {
820 $tar_archive = Archive::Tar::Builder->new(
821 ignore_errors => 1
822 );
823
824 # Set output name
825 my $tar_file = $output;
826 unless ($tar_file =~ /\.tar$/) {
827 $tar_file .= '.tar';
828 };
829
830 # Initiate the tar file
831 print "Writing to file $tar_file\n";
832 $tar_fh = IO::File->new($tar_file, 'w');
833 $tar_fh->binmode(1);
834
835 # Set handle
836 $tar_archive->set_handle($tar_fh);
837
838 # Output to temporary directory
839 $output_dir = File::Temp->newdir;
840 };
841
Akron941c1a62016-02-23 17:41:41 +0100842 # Report on fork message
843 $pool->run_on_finish (
844 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200845 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100846 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200847
Akron08385f62016-03-22 20:37:04 +0100848 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200849 ($iter++) . "/$count]" .
850 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200851 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200852
853 if (!$code && $to_tar && $data->[2]) {
854 my $filename = $data->[2];
855
856 # Lock filehandle
857 if (flock($tar_fh, LOCK_EX)) {
858
Akron9a062ce2017-07-04 19:12:05 +0200859 my $clean_file = fileparse($filename);
860
Akronda3097e2017-04-23 19:53:57 +0200861 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200862 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200863 unlink $filename;
864
865 # Unlock filehandle
866 flock($tar_fh, LOCK_UN);
867 }
868 else {
869 $log->warn("Unable to add $filename to archive");
870 };
871 };
872
Akron4c0cf312016-10-15 16:42:09 +0200873 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100874 }
875 );
876
877 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200878 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100879 print "Reading data ...\n";
880
Akron7d4cdd82016-08-17 21:39:45 +0200881 # unless (Cache::FastMmap->new(
882 # share_file => $cache_file,
883 # cache_size => $cache_size,
884 # init_file => $cache_init
885 # )) {
886 # print "Unable to intialize cache '$cache_file'\n\n";
887 # exit(1);
888 # };
Akron11c80302016-03-18 19:44:43 +0100889
Akron486f9ab2017-04-22 23:25:19 +0200890
Akron941c1a62016-02-23 17:41:41 +0100891 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100892 if (-d $input[0]) {
893 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100894 my @dirs;
895 my $dir;
896
Akron7d4cdd82016-08-17 21:39:45 +0200897 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100898 while (1) {
899 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200900 push @dirs, $dir;
901 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100902 };
903 last unless $it->next;
904 };
905
906 print "Start processing ...\n";
907 $t = Benchmark->new;
908 $count = scalar @dirs;
909
910 DIRECTORY_LOOP:
911 for (my $i = 0; $i < $count; $i++) {
912
Akrone1dbc382016-07-08 22:24:52 +0200913 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200914 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200915 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200916 );
Akron941c1a62016-02-23 17:41:41 +0100917
918 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200919 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200920
Akron13d56622016-10-31 14:54:49 +0100921 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200922 $pool->finish(
923 0,
Akronda3097e2017-04-23 19:53:57 +0200924 [
925 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
926 undef,
927 $filename
928 ]
Akron486f9ab2017-04-22 23:25:19 +0200929 );
Akron3ec48972016-08-17 23:24:52 +0200930 }
931 else {
Akron4c0cf312016-10-15 16:42:09 +0200932 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200933 };
Akron941c1a62016-02-23 17:41:41 +0100934 };
935 }
936
937 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200938 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200939
Akron941c1a62016-02-23 17:41:41 +0100940 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200941 $log->error("Unzip is not installed or incompatible.");
942 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100943 };
944
Akron08385f62016-03-22 20:37:04 +0100945 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200946 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100947
Akron941c1a62016-02-23 17:41:41 +0100948 print "Start processing ...\n";
949 $t = Benchmark->new;
950 my @dirs = $archive->list_texts;
951 $count = scalar @dirs;
952
953 ARCHIVE_LOOP:
954 for (my $i = 0; $i < $count; $i++) {
955
956 # Split path information
957 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
958
Akrone1dbc382016-07-08 22:24:52 +0200959 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200960 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200961 get_file_name(
962 catfile($corpus, $doc, $text)
963 . '.json' . ($gzip ? '.gz' : '')
964 )
Akrone1dbc382016-07-08 22:24:52 +0200965 );
Akron941c1a62016-02-23 17:41:41 +0100966
967 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200968 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100969
Akron4c0cf312016-10-15 16:42:09 +0200970 # Create temporary file
971 $temp = File::Temp->newdir;
972
Akronbdf434a2016-10-24 17:42:07 +0200973 # TODO: Check if $filename exist at the beginning,
974 # because extraction can be horrible slow!
975
Akron941c1a62016-02-23 17:41:41 +0100976 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200977 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100978
Akron7d4cdd82016-08-17 21:39:45 +0200979 # Create corpus directory
980 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100981
Akron7d4cdd82016-08-17 21:39:45 +0200982 # Temporary directory
983 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100984
Akron7d4cdd82016-08-17 21:39:45 +0200985 # Write file
Akron13d56622016-10-31 14:54:49 +0100986 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200987
Akron4c0cf312016-10-15 16:42:09 +0200988 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100989 $pool->finish(
990 0,
Akronda3097e2017-04-23 19:53:57 +0200991 [
992 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
993 $temp,
994 $filename
995 ]
Akron13d56622016-10-31 14:54:49 +0100996 );
997 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200998 }
999 else {
Akron4c0cf312016-10-15 16:42:09 +02001000 # Delete temporary file
1001 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001002 };
Akron941c1a62016-02-23 17:41:41 +01001003 }
Akron7d4cdd82016-08-17 21:39:45 +02001004
1005 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001006 else {
Akron4c0cf312016-10-15 16:42:09 +02001007 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001008 };
1009 };
1010 }
1011
1012 else {
1013 print "Input is neither a directory nor an archive.\n\n";
1014 };
1015
1016 $pool->wait_all_children;
1017
Akron11c80302016-03-18 19:44:43 +01001018 # Delete cache file
1019 unlink($cache_file) if $cache_delete;
1020
Akronda3097e2017-04-23 19:53:57 +02001021 # Close tar filehandle
1022 if ($to_tar && $tar_fh) {
1023 $tar_archive->finish;
1024 $tar_fh->close;
1025 print "Wrote to tar archive.\n";
1026 };
1027
Akron63f20d42017-04-10 23:40:29 +02001028 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001029 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001030};
Akron941c1a62016-02-23 17:41:41 +01001031
Nils Diewald2db9ad02013-10-29 19:26:43 +00001032
# Cleanup temporary extraction directory.
# remove_tree() returns the number of removed objects. Passing the
# 'error' option collects failures in $errors (instead of carping),
# so that an incomplete cleanup is not reported as a success.
if ($extract_dir) {
  my $objects = remove_tree($extract_dir, { safe => 1, error => \my $errors });

  if ($errors && @$errors) {

    # Each diagnostic is a single-pair hash (file name => message);
    # an empty file name marks a general error.
    foreach my $diag (@$errors) {
      my ($file, $message) = %$diag;
      warn(
        $file eq ''
          ? "Unable to remove $extract_dir: $message\n"
          : "Unable to remove $file: $message\n"
      );
    };

    print "Removed only $objects objects of directory $extract_dir.\n";
  }
  else {
    print "Removed directory $extract_dir with $objects objects.\n";
  };
};


print "\n";
1041
Nils Diewald2db9ad02013-10-29 19:26:43 +00001042__END__
Akron941c1a62016-02-23 17:41:41 +01001043
1044=pod
1045
1046=encoding utf8
1047
1048=head1 NAME
1049
korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001051
1052
1053=head1 SYNOPSIS
1054
Akrona76d8352016-10-27 16:27:32 +02001055 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001056
Akron2fd402b2016-10-27 21:26:48 +02001057
Akron941c1a62016-02-23 17:41:41 +01001058=head1 DESCRIPTION
1059
1060L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1061compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001062The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001063
1064
1065=head1 INSTALLATION
1066
1067The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1068
Akronaf386982016-10-12 00:33:25 +02001069 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001070
In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.
The minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001075
1076=head1 ARGUMENTS
1077
Akrona76d8352016-10-27 16:27:32 +02001078 $ korapxml2krill -z --input <directory> --output <filename>
1079
1080Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001081It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001082
Akron941c1a62016-02-23 17:41:41 +01001083=over 2
1084
1085=item B<archive>
1086
Akron081639e2017-04-21 19:01:39 +02001087 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001088
Akron2fd402b2016-10-27 21:26:48 +02001089Converts an archive of KorAP-XML documents. It expects a directory
1090(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001091
1092=item B<extract>
1093
Akrona76d8352016-10-27 16:27:32 +02001094 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1095
1096Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001097
Akron63f20d42017-04-10 23:40:29 +02001098=item B<serial>
1099
1100 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1101
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001107
1108
Akron941c1a62016-02-23 17:41:41 +01001109=back
1110
1111
1112=head1 OPTIONS
1113
1114=over 2
1115
Akrona76d8352016-10-27 16:27:32 +02001116=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001117
Akrona76d8352016-10-27 16:27:32 +02001118Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001119
Akron7606afa2016-10-25 16:23:49 +02001120Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001121document, while C<archive> expects a KorAP-XML corpus folder or a zip
1122file to batch process multiple files.
1123C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001124
C<archive> supports multiple input zip files with the constraint
that the first archive listed contains all primary data files
and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001128
Akron7606afa2016-10-25 16:23:49 +02001129 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001130
Akron821db3d2017-04-06 21:19:31 +02001131Input may also be defined using BSD glob wildcards.
1132
1133 -i 'file/news*.zip'
1134
1135The extended input array will be sorted in length order, so the shortest
1136path needs to contain all primary data files and all meta data files.
1137
(The directory structure follows the base directory format,
which may include a C<.> root folder.
In this case, further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001143
Akron7606afa2016-10-25 16:23:49 +02001144To support zip files, a version of C<unzip> needs to be installed that is
1145compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001146
Akron7606afa2016-10-25 16:23:49 +02001147B<The root folder switch using the hash sign is experimental and
1148may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001149
Akronf73ffb62018-06-27 12:13:59 +02001150
Akron63f20d42017-04-10 23:40:29 +02001151=item B<--input-base|-ib> <directory>
1152
1153The base directory for inputs.
1154
1155
Akron941c1a62016-02-23 17:41:41 +01001156=item B<--output|-o> <directory|file>
1157
1158Output folder for archive processing or
1159document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001160writes to C<STDOUT> by default
1161(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001162
1163=item B<--overwrite|-w>
1164
1165Overwrite files that already exist.
1166
Akronf73ffb62018-06-27 12:13:59 +02001167
Akron3741f8b2016-12-21 19:55:21 +01001168=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001169
1170Define the default tokenization by specifying
1171the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001172of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001173
Akron3741f8b2016-12-21 19:55:21 +01001174
1175=item B<--base-sentences|-bs> <foundry>#<layer>
1176
1177Define the layer for base sentences.
1178If given, this will be used instead of using C<Base#Sentences>.
1179Currently C<DeReKo#Structure> is the only additional layer supported.
1180
1181 Defaults to unset.
1182
1183
1184=item B<--base-paragraphs|-bp> <foundry>#<layer>
1185
1186Define the layer for base paragraphs.
1187If given, this will be used instead of using C<Base#Paragraphs>.
1188Currently C<DeReKo#Structure> is the only additional layer supported.
1189
1190 Defaults to unset.
1191
1192
Akron41ac10b2017-02-08 22:47:25 +01001193=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1194
1195Define the layer for base pagebreaks.
1196Currently C<DeReKo#Structure> is the only layer supported.
1197
1198 Defaults to unset.
1199
1200
Akron941c1a62016-02-23 17:41:41 +01001201=item B<--skip|-s> <foundry>[#<layer>]
1202
Akronf7ad89e2016-03-16 18:22:47 +01001203Skip specific annotations by specifying the foundry
1204(and optionally the layer with a C<#>-prefix),
1205e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001206Can be set multiple times.
1207
Akronf73ffb62018-06-27 12:13:59 +02001208
Akronc13a1702016-03-15 19:33:14 +01001209=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001210
Akronf7ad89e2016-03-16 18:22:47 +01001211Convert specific annotations by specifying the foundry
1212(and optionally the layer with a C<#>-prefix),
1213e.g. C<Mate> or C<Mate#Morpho>.
1214Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001215
Akronf73ffb62018-06-27 12:13:59 +02001216
Akron941c1a62016-02-23 17:41:41 +01001217=item B<--primary|-p>
1218
Akronc13a1702016-03-15 19:33:14 +01001219Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001220Can be flagged using C<--no-primary> as well.
1221This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001222
Akronf73ffb62018-06-27 12:13:59 +02001223
Akron941c1a62016-02-23 17:41:41 +01001224=item B<--jobs|-j>
1225
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001229
1230If C<sequential-extraction> is not set to false, this will
1231also apply to extraction.
1232
Akronc11f7982017-02-21 21:20:14 +01001233Pass -1, and the value will be set automatically to 5
1234times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001235This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001236
Akronf73ffb62018-06-27 12:13:59 +02001237
Akron9ec88872017-04-12 16:29:06 +02001238=item B<--sequential-extraction|-se>
1239
1240Flag to indicate, if the C<jobs> value also applies to extraction.
1241Some systems may have problems with extracting multiple archives
1242to the same folder at the same time.
1243Can be flagged using C<--no-sequential-extraction> as well.
1244Defaults to C<false>.
1245
Akronf73ffb62018-06-27 12:13:59 +02001246
Akron35db6e32016-03-17 22:42:22 +01001247=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001248
Akron35db6e32016-03-17 22:42:22 +01001249Define the metadata parser to use. Defaults to C<I5>.
1250Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1251This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001252
Akronf73ffb62018-06-27 12:13:59 +02001253
Akron941c1a62016-02-23 17:41:41 +01001254=item B<--pretty|-y>
1255
Akronc13a1702016-03-15 19:33:14 +01001256Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001257This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001258
Akronf73ffb62018-06-27 12:13:59 +02001259
Akron941c1a62016-02-23 17:41:41 +01001260=item B<--gzip|-z>
1261
Akronf7ad89e2016-03-16 18:22:47 +01001262Compress the output.
1263Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001264
Akronf73ffb62018-06-27 12:13:59 +02001265
Akron11c80302016-03-18 19:44:43 +01001266=item B<--cache|-c>
1267
1268File to mmap a cache (using L<Cache::FastMmap>).
1269Defaults to C<korapxml2krill.cache> in the calling directory.
1270
Akronf73ffb62018-06-27 12:13:59 +02001271
Akron11c80302016-03-18 19:44:43 +01001272=item B<--cache-size|-cs>
1273
1274Size of the cache. Defaults to C<50m>.
1275
Akronf73ffb62018-06-27 12:13:59 +02001276
Akron11c80302016-03-18 19:44:43 +01001277=item B<--cache-init|-ci>
1278
1279Initialize cache file.
1280Can be flagged using C<--no-cache-init> as well.
1281Defaults to C<true>.
1282
Akronf73ffb62018-06-27 12:13:59 +02001283
Akron11c80302016-03-18 19:44:43 +01001284=item B<--cache-delete|-cd>
1285
1286Delete cache file after processing.
1287Can be flagged using C<--no-cache-delete> as well.
1288Defaults to C<true>.
1289
Akronf73ffb62018-06-27 12:13:59 +02001290
Akron636aa112017-04-07 18:48:56 +02001291=item B<--config|-cfg>
1292
Configure the parameters of your call in a file
of key-value pairs with a whitespace separator:
1295
1296 overwrite 1
1297 token DeReKo#Structure
1298 ...
1299
1300Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001301C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001302C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001303C<output>,
1304C<temp-extract>, C<sequential-extraction>,
1305C<base-sentences>, C<base-paragraphs>,
1306C<base-pagebreaks>,
1307C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001308(semicolon separated), C<anno> (semicolon separated).
1309
Akronf73ffb62018-06-27 12:13:59 +02001310Configuration parameters will always be overwritten by
1311passed parameters.
1312
1313
Akron81500102017-04-07 20:45:44 +02001314=item B<--temporary-extract|-te>
1315
1316Only valid for the C<archive> command.
1317
1318This will first extract all files into a
1319directory and then will archive.
1320If the directory is given as C<:temp:>,
1321a temporary directory is used.
1322This is especially useful to avoid
1323massive unzipping and potential
1324network latency.
Akron636aa112017-04-07 18:48:56 +02001325
Akronf73ffb62018-06-27 12:13:59 +02001326
Akrone10ad322016-02-27 10:54:26 +01001327=item B<--sigle|-sg>
1328
Akron20807582016-10-26 17:11:34 +02001329Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001330Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001331I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001332Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001333In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001334On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001335
Akronf73ffb62018-06-27 12:13:59 +02001336
Akron941c1a62016-02-23 17:41:41 +01001337=item B<--log|-l>
1338
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1340
Akronf73ffb62018-06-27 12:13:59 +02001341
Akron941c1a62016-02-23 17:41:41 +01001342=item B<--help|-h>
1343
1344Print this document.
1345
Akronf73ffb62018-06-27 12:13:59 +02001346
Akron941c1a62016-02-23 17:41:41 +01001347=item B<--version|-v>
1348
1349Print version information.
1350
1351=back
1352
Akronf73ffb62018-06-27 12:13:59 +02001353
Akronc13a1702016-03-15 19:33:14 +01001354=head1 ANNOTATION SUPPORT
1355
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1360
Akron821db3d2017-04-06 21:19:31 +02001361 Base
1362 #Paragraphs
1363 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001364
Akron821db3d2017-04-06 21:19:31 +02001365 Connexor
1366 #Morpho
1367 #Phrase
1368 #Sentences
1369 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001370
Akron821db3d2017-04-06 21:19:31 +02001371 CoreNLP
1372 #Constituency
1373 #Morpho
1374 #NamedEntities
1375 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001376
Akronce125b62017-06-19 11:54:36 +02001377 CMC
1378 #Morpho
1379
Akron821db3d2017-04-06 21:19:31 +02001380 DeReKo
1381 #Structure
Akronc13a1702016-03-15 19:33:14 +01001382
Akron821db3d2017-04-06 21:19:31 +02001383 DRuKoLa
1384 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001385
Akron821db3d2017-04-06 21:19:31 +02001386 Glemm
1387 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001388
Akronea1aed52018-07-19 14:43:34 +02001389 HNC
1390 #Morpho
1391
Akron4c679192018-01-16 17:41:49 +01001392 LWC
1393 #Dependency
1394
Akron821db3d2017-04-06 21:19:31 +02001395 Malt
1396 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001397
Akron821db3d2017-04-06 21:19:31 +02001398 MarMoT
1399 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001400
Akron821db3d2017-04-06 21:19:31 +02001401 Mate
1402 #Dependency
1403 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001404
Akron821db3d2017-04-06 21:19:31 +02001405 MDParser
1406 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001407
Akron821db3d2017-04-06 21:19:31 +02001408 OpenNLP
1409 #Morpho
1410 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001411
Akron821db3d2017-04-06 21:19:31 +02001412 Sgbr
1413 #Lemma
1414 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001415
Akron821db3d2017-04-06 21:19:31 +02001416 TreeTagger
1417 #Morpho
1418 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001419
Akron821db3d2017-04-06 21:19:31 +02001420 XIP
1421 #Constituency
1422 #Morpho
1423 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001424
Akronc13a1702016-03-15 19:33:14 +01001425
1426More importers are in preparation.
1427New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1428See the built-in annotation importers as examples.
1429
Akronf73ffb62018-06-27 12:13:59 +02001430
Akron941c1a62016-02-23 17:41:41 +01001431=head1 AVAILABILITY
1432
1433 https://github.com/KorAP/KorAP-XML-Krill
1434
1435
1436=head1 COPYRIGHT AND LICENSE
1437
Akron4c679192018-01-16 17:41:49 +01001438Copyright (C) 2015-2018, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001439
Akron941c1a62016-02-23 17:41:41 +01001440Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001441
Akrona76d8352016-10-27 16:27:32 +02001442Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001443
1444L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1445Corpus Analysis Platform at the
1446L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1447member of the
1448L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1449
1450This program is free software published under the
1451L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1452
1453=cut