blob: 7a7b8f7a66d5368bfc3826b955254231c147c3b2 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
# TODO: make output files

# TODO: Use KorAP::XML::ForkPool!
38
# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
#
# 2016/07/06
# - Added MDParser#Dependency
#
# 2016/10/15
# - Fixed temporary path issue in script
#
# 2016/10/24
# - Improved Windows support
#
# 2016/10/24
# - Added support for document extraction
#
# 2016/10/27
# - Added wildcard support for document extraction
#
# 2016/12/21
# - added support for base-sentences and base-tokenizations
#
# 2017/01/20
# - added support for DRuKoLa annotations
#
# 2017/02/08
# - added support for pagebreak annotations
#
# 2017/04/06
# - added support for wildcards in input
#
# 2017/04/07
# - support configuration option
# - support for temporary extraction
#
# 2017/04/12
# - support serial processing
# - support input root
# - introduced --sequential-extraction flag
# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100117
# Release information reported by --help and --version
our $LAST_CHANGE = '2017/04/12';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Prototypes
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a leading argument that does not start with a dash
# names the processing mode
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Keep a copy of the raw options, as serial mode passes them on
# to the archive commands it spawns
my @keep_argv = @ARGV;

# List options
my (@skip, @sigle, @anno, @input);
my $text;

# Scalar options
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Arguments shared by the help and the version output
my %usage_output = (
  -msg    => $VERSION_MSG,
  -output => '-'
);

# Parse options from the command line
GetOptions(
  'input|i=s'                 => \@input,
  'input-base|ib=s'           => \$input_base,
  'output|o=s'                => \$output,
  'overwrite|w'               => \$overwrite,
  'meta|m=s'                  => \$meta,
  'token|t=s'                 => \$token_base,
  'base-sentences|bs=s'       => \$base_sentences,
  'base-paragraphs|bp=s'      => \$base_paragraphs,
  'base-pagebreaks|bpb=s'     => \$base_pagebreaks,
  'gzip|z'                    => \$gzip,
  'temporary-extract|te=s'    => \$extract_dir,
  'skip|s=s'                  => \@skip,
  'sigle|sg=s'                => \@sigle,
  'cache|c=s'                 => \$cache_file,
  'config|cfg=s'              => \$cfg_file,
  'log|l=s'                   => \$log_level,
  'anno|a=s'                  => \@anno,
  'primary|p!'                => \$primary,
  'pretty|y'                  => \$pretty,
  'jobs|j=i'                  => \$jobs,
  'to-tar'                    => \$to_tar,
  'sequential-extraction|se'  => \$sequential_extraction,
  'cache-size|cs=s'           => \$cache_size,
  'cache-delete|cd!'          => \$cache_delete,
  'cache-init|ci!'            => \$cache_init,

  # Print the selected POD sections
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      %usage_output
    );
  },

  # Print the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      %usage_output
    );
  }
);
182
Akron63f20d42017-04-10 23:40:29 +0200183
# Load from configuration.
# Values given on the command line always take precedence; the
# configuration file only fills options that are still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: configuration key => variable to fill
  my @scalar_options = (
    'overwrite'             => \$overwrite,
    'gzip'                  => \$gzip,
    'jobs'                  => \$jobs,
    'input-base'            => \$input_base,
    'temporary-extract'     => \$extract_dir,
    'token'                 => \$token_base,
    'cache'                 => \$cache_file,
    'cache-size'            => \$cache_size,
    'cache-delete'          => \$cache_delete,
    'cache-init'            => \$cache_init,
    'sequential-extraction' => \$sequential_extraction,
    'meta'                  => \$meta,
    'output'                => \$output,
    'base-sentences'        => \$base_sentences,
    'base-paragraphs'       => \$base_paragraphs,
    'base-pagebreaks'       => \$base_pagebreaks,
    'to-tar'                => \$to_tar,
    'log'                   => \$log_level,
  );

  while (my ($key, $target) = splice(@scalar_options, 0, 2)) {
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # List options: values are separated by semicolons in the configuration
  my @list_options = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno,
  );

  while (my ($key, $target) = splice(@list_options, 0, 2)) {
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
295
Akron63f20d42017-04-10 23:40:29 +0200296
# Set defaults for everything that is still undefined after the
# command line and the configuration file were evaluated
my @defaults = (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR'],
);
${$_->[0]} //= $_->[1] foreach @defaults;

# Base annotations default to the empty string and are compared lowercased
$_ = lc($_ // '') foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);


# Initialize log4perl object
Log::Log4perl->init({
  'log4perl.rootLogger' => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};


# Usage output for invalid calls - exits with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined, and
# gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);


# A job count of -1 requests five jobs per CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
349
Akron821db3d2017-04-06 21:19:31 +0200350
# Start serial processing:
# Every input is converted by a separate 'archive' run of this script,
# one after the other, each writing to its own output below $output.
# $cmd is undefined in single file mode, so check for definedness first
# to avoid an "uninitialized value" warning.
if (defined $cmd && $cmd eq 'serial') {

  # Without --to-tar the output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set individually for each archive command.
  # Only the separated form ("-i value") is recognized.
  my %takes_value = map { $_ => 1 } ('-i', '--input', '--output', '-o');
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - drop it and remember to drop its value
    if ($takes_value{$_}) {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of a dropped flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass every other parameter
    $keep;
  } @keep_argv;


  # Iterate over all inputs
  foreach (@input) {

    # Output name derived from the (unexpanded) input pattern
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit(0);
      };
    };

    # Create archive command - passed as a list, so no shell is involved
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - system returns -1 if the command can't be started
    if (system(@archive_cmd) == -1) {
      $log->error("Unable to start archive processing of $_: $!");
    };
  };

  exit(0);
};
405
# Foundries and layers to skip are compared case insensitively
my %skip = map { lc($_) => 1 } @skip;

# All supported annotation layers as [Foundry, Layer] pairs
my @layers;

# Base annotations are only added if no other base is chosen
push(@layers, ['Base', 'Sentences']) unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

# Connexor
push(@layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/);

# CoreNLP
push(@layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/);


# DeReKo - collect the base annotations taken from DeReKo#Structure
my %base_for = (
  'sentences'  => $base_sentences,
  'paragraphs' => $base_paragraphs,
  'pagebreaks' => $base_pagebreaks
);
my @dereko_attr = grep {
  $base_for{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

# The chosen bases are passed as a third element
push(@layers, [
  'DeReKo',
  'Structure',
  (@dereko_attr ? ('base-' . join('-', @dereko_attr)) : ())
]);

# Further foundries with their layers - the order is significant
foreach my $entry (
  ['Glemm',      qw/Morpho/],
  ['Malt',       qw/Dependency/],
  ['MDParser',   qw/Dependency/],
  ['Mate',       qw/Morpho Dependency/],
  ['OpenNLP',    qw/Morpho Sentences/],
  ['Sgbr',       qw/Lemma Morpho/],         # Schreibgebrauch
  ['TreeTagger', qw/Morpho Sentences/],
  ['XIP',        qw/Morpho Constituency Sentences Dependency/],
  ['DRuKoLa',    qw/Morpho/],
  ['MarMoT',     qw/Morpho/]
) {
  my ($foundry, @foundry_layers) = @$entry;
  push(@layers, map { [$foundry, $_] } @foundry_layers);
};


# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly requested annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
502
# Get tokenization basis (Foundry#Layer).
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache shared via a memory mapped file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that is used to process each single text
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
526
# Get file name based on path information.
#
# Expects the path of a text directory and returns a flat name with
# all slashes turned into dashes. Temporary directory fragments and
# the path of the first input are removed beforehand.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input - for directories the last path
  # component is stripped (unless the path ends with a slash)
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the input path. The path is
  # quoted, as it is a literal string and not a pattern - otherwise
  # characters like '.' or '+' in the path are treated as regular
  # expression operators (e.g. './' would match 's/' in 'corpus/').
  # The match is deliberately not anchored, as before.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the leading component, if more than three components remain
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
543
Akron63f20d42017-04-10 23:40:29 +0200544
# Turn an input path, that may contain wildcards, into a flat name
# usable for an output directory or file
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  for ($name) {
    tr{\\/}{-};      # Transform path separators
    tr/*?//d;        # Remove arbitrary fills
    tr/{}[]/-/;      # Replace class and alternation brackets
    tr/-//s;         # Squeeze sequences of binding characters
    s/^-//;          # Clean beginning
    s/-$//;          # Clean end
    s/\.zip$//;      # Remove file extension
  };

  return $name;
};
556
557
# Convert sigle to path construct (CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Without --to-tar the output of a command has to be an existing directory
if ($cmd) {
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };
};


# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    $wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($wild_card));
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  # A wildcard without any match expands to nothing - stop here,
  # as all further processing relies on a defined first input
  unless (@input) {
    $log->error('No input found matching the given path');
    exit(1);
  };

  print 'Input is ' . join(', ', @input)."\n";
};
588
589
# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
    );
    $main::LAST_STOP = $new;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  # NOTE(review): the return value is not inspected here - a failed
  # conversion is therefore not reflected in the exit status
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;

  # Finished regularly - report success to the calling process
  # (was a non-zero exit status before)
  exit(0);
};
621
Nils Diewald59094f22014-11-05 18:20:50 +0000622
Akrone10ad322016-02-27 10:54:26 +0100623# Extract XML files
Akron81500102017-04-07 20:45:44 +0200624if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100625
Akron7d4cdd82016-08-17 21:39:45 +0200626 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200627 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100628
Akron7d4cdd82016-08-17 21:39:45 +0200629 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100630 unless ($archive->test_unzip) {
631 print "Unzip is not installed or incompatible.\n\n";
Akron81500102017-04-07 20:45:44 +0200632 exit(0);
Akrone10ad322016-02-27 10:54:26 +0100633 };
634
Akronb0c88db2016-06-29 16:33:18 +0200635 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200636 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200637
Akron651cb8d2016-08-16 21:44:49 +0200638 my $prefix = 1;
639
Akron03b24db2016-08-16 20:54:32 +0200640 # No sigles given
641 unless (@sigle) {
642
643 # Get files
644 foreach ($archive->list_texts) {
645
646 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200647 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200648
649 # TODO: Make this OS independent
650 push @sigle, join '/', $corpus, $doc, $text;
651 };
Akron20807582016-10-26 17:11:34 +0200652 }
653
654 # Check sigle for doc sigles
655 else {
656 my @new_sigle;
657
658 my $prefix_check = 0;
659
660 # Iterate over all sigle
661 foreach (@sigle) {
662
663 # Sigle is a doc sigle
664 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200665
Akron60a8caa2017-02-17 21:51:27 +0100666 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200667 # Check if a prefix is needed
668 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100669
670 if ($prefix = $archive->check_prefix) {
671 print " with prefix ...";
672 };
Akron20807582016-10-26 17:11:34 +0200673 $prefix_check = 1;
674 };
675
Akron60a8caa2017-02-17 21:51:27 +0100676 print "\n";
677
Akron20807582016-10-26 17:11:34 +0200678 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200679 my $path = ($prefix ? './' : '') . $_;
680
681 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200682 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200683 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200684 ) ? '' : 'not '
685 );
686 print "extracted.\n";
687 }
Akron60a8caa2017-02-17 21:51:27 +0100688
689 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200690 else {
691 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100692
693 unless ($prefix_check) {
694
695 if ($prefix = $archive->check_prefix) {
696 print " with prefix ...";
697 };
698 $prefix_check = 1;
699 };
Akron20807582016-10-26 17:11:34 +0200700 };
701 };
702 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200703 };
704
Akrone10ad322016-02-27 10:54:26 +0100705 # Iterate over all given sigles and extract
706 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100707
Akron2812ba22016-10-28 21:55:59 +0200708 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200709
Akron03b24db2016-08-16 20:54:32 +0200710 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200711 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100712
Akron20807582016-10-26 17:11:34 +0200713 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200714 ($prefix ? './' : '') . $_, $output
715 ) ? '' : 'not '
716 );
Akrone10ad322016-02-27 10:54:26 +0100717 print "extracted.\n";
718 };
Akronb0c88db2016-06-29 16:33:18 +0200719 }
Akron7d4cdd82016-08-17 21:39:45 +0200720
721 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200722 else {
723 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron81500102017-04-07 20:45:44 +0200724 exit(1);
Akrone10ad322016-02-27 10:54:26 +0100725 };
726}
727
Akron81500102017-04-07 20:45:44 +0200728
Akron941c1a62016-02-23 17:41:41 +0100729# Process an archive
730elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000731
Akron81500102017-04-07 20:45:44 +0200732 my $archive_output;
733
734 # First extract, then archive
735 if (defined $extract_dir) {
736
737 # Create new archive object
738 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
739
740 # Check zip capabilities
741 unless ($archive->test_unzip) {
742 print "Unzip is not installed or incompatible.\n\n";
743 exit(0);
744 };
745
746 # Add further annotation archived
747 $archive->attach($_) foreach @input[1..$#input];
748
749 # Create a temporary directory
750 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200751 $extract_dir = tempdir(CLEANUP => 0);
752 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200753 };
754
Akron63f20d42017-04-10 23:40:29 +0200755 # Add some random extra to avoid clashes with multiple archives
756 $extract_dir = catdir($extract_dir, random_string('cccccc'));
757
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200759 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200760 @input = ($extract_dir);
761 }
762 else {
763 $log->error('Unable to extract from primary archive ' . $input[0] .
764 ' to ' . $extract_dir);
765 exit(1);
766 };
767 }
768
769 # Can't create archive object
770 else {
771 $log->error('Unable to extract from primary archive ' . $input[0]);
772 exit(1);
773 };
774 };
775
Akrone1dbc382016-07-08 22:24:52 +0200776 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100777
Akron7d4cdd82016-08-17 21:39:45 +0200778 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100779 my $pool = Parallel::ForkManager->new($jobs);
780
Akron7d4cdd82016-08-17 21:39:45 +0200781 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100782 my $iter = 1; # Current text in process
783
Akronda3097e2017-04-23 19:53:57 +0200784 my $tar_archive;
785 my $output_dir = $output;
786 my $tar_fh;
787
788 # Initialize tar archive
789 if ($to_tar) {
790 $tar_archive = Archive::Tar::Builder->new(
791 ignore_errors => 1
792 );
793
794 # Set output name
795 my $tar_file = $output;
796 unless ($tar_file =~ /\.tar$/) {
797 $tar_file .= '.tar';
798 };
799
800 # Initiate the tar file
801 print "Writing to file $tar_file\n";
802 $tar_fh = IO::File->new($tar_file, 'w');
803 $tar_fh->binmode(1);
804
805 # Set handle
806 $tar_archive->set_handle($tar_fh);
807
808 # Output to temporary directory
809 $output_dir = File::Temp->newdir;
810 };
811
Akron941c1a62016-02-23 17:41:41 +0100812 # Report on fork message
813 $pool->run_on_finish (
814 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200815 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100816 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200817
Akron08385f62016-03-22 20:37:04 +0100818 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200819 ($iter++) . "/$count]" .
820 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200821 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200822
823 if (!$code && $to_tar && $data->[2]) {
824 my $filename = $data->[2];
825
826 # Lock filehandle
827 if (flock($tar_fh, LOCK_EX)) {
828
829 # Archive and remove file
830 $tar_archive->archive($filename);
831 unlink $filename;
832
833 # Unlock filehandle
834 flock($tar_fh, LOCK_UN);
835 }
836 else {
837 $log->warn("Unable to add $filename to archive");
838 };
839 };
840
Akron4c0cf312016-10-15 16:42:09 +0200841 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100842 }
843 );
844
# Benchmark start marker; only assigned once one of the input branches
# below actually starts converting.
my $t;

# Temporary directory a single text is extracted to (archive input only).
my $temp;
print "Reading data ...\n";

# Disabled cache sanity check, kept for reference:
# unless (Cache::FastMmap->new(
#   share_file => $cache_file,
#   cache_size => $cache_size,
#   init_file => $cache_init
# )) {
#   print "Unable to initialize cache '$cache_file'\n\n";
#   exit(1);
# };


# Case 1: the first input is a directory tree of KorAP-XML texts
if (-d $input[0]) {
  my $it = Directory::Iterator->new($input[0]);
  my @dirs;
  my $dir;

  # Look at the iterator's current entry first and advance afterwards.
  # Every path ending in "/data.xml" contributes its parent directory,
  # and the iterator is pruned at that point.
  do {
    unless ($it->is_directory) {
      $dir = $it->get;
      if ($dir && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
    };
  } while ($it->next);

  print "Start processing ...\n";
  $t = Benchmark->new;
  $count = scalar @dirs;

 DIRECTORY_LOOP:
  foreach my $text_dir (@dirs) {

    # Target file: <output_dir>/<name>.json[.gz]
    my $extension = '.json' . ($gzip ? '.gz' : '');
    my $filename = catfile($output_dir, get_file_name($text_dir) . $extension);

    # Hand the job to the pool; when start() returns a true value
    # this iteration is done for the current process.
    $pool->start and next DIRECTORY_LOOP;

    my $return = $batch_file->process($text_dir => $filename);

    if (!$return) {
      $pool->finish(1, ["Unable to process " . $text_dir]);
    }
    else {
      # A return value of -1 is reported as "already existing"
      my $note = $return == -1 ? " - already existing" : '';
      $pool->finish(0, ["Processed " . $filename . $note, undef, $filename]);
    };
  };
}

# Case 2: the first input is a file that can be opened as an archive
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

  if (!$archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # All remaining inputs are attached as further annotation archives
  for (@input[1..$#input]) {
    $archive->attach($_);
  };

  print "Start processing ...\n";
  $t = Benchmark->new;
  my @dirs = $archive->list_texts;
  $count = scalar @dirs;

 ARCHIVE_LOOP:
  foreach my $text_path (@dirs) {

    # Split path information (the leading prefix is not needed here)
    my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

    # Target file; the extension is part of the name passed
    # to get_file_name() in this branch.
    my $relative_name =
      catfile($corpus, $doc, $text) . '.json' . ($gzip ? '.gz' : '');
    my $filename = catfile($output_dir, get_file_name($relative_name));

    # Hand the job to the pool; when start() returns a true value
    # this iteration is done for the current process.
    $pool->start and next ARCHIVE_LOOP;

    # Fresh temporary directory for this text
    $temp = File::Temp->newdir;

    # TODO: Check if $filename exists at the beginning,
    #       because extraction can be horribly slow!

    if (!$archive->extract_text($text_path, $temp)) {

      # Unable to extract
      $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
    }
    else {

      # Location of the extracted text below the temporary directory
      my $corpus_dir = catdir("$temp", $corpus);
      my $dir = catdir($corpus_dir, $doc, $text);

      # Write file
      my $return = $batch_file->process($dir => $filename);

      if (!$return) {
        # The temporary directory is handed over with the result
        $pool->finish(1, ["Unable to process " . $dir, $temp]);
      }
      else {
        # A return value of -1 is reported as "already existing"
        my $note = $return == -1 ? " - already existing" : '';
        $pool->finish(0, ["Processed " . $filename . $note, $temp, $filename]);
      };
    };
  };
}

# Case 3: nothing usable was passed
else {
  print "Input is neither a directory nor an archive.\n\n";
};
983
# Wrap-up of the archive conversion: wait for outstanding jobs, remove the
# cache file, finalize the tar output and report the elapsed time.

# Wait for the pool's outstanding children
# (presumably a Parallel::ForkManager pool - confirm where $pool is created)
$pool->wait_all_children;

# Delete cache file
unlink($cache_file) if $cache_delete;

# Finalize the tar archive and close its filehandle
if ($to_tar && $tar_fh) {
  $tar_archive->finish;
  $tar_fh->close;
  print "Wrote to tar archive.\n";
};

# $t is only assigned once processing of a directory or an archive has
# started. For any other input it is still undefined, and passing an
# undefined value to timediff() aborts the script with Benchmark's usage
# message - so only report a duration when there is one.
print timestr(timediff(Benchmark->new, $t))."\n" if $t;
print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200998};
Akron941c1a62016-02-23 17:41:41 +0100999
Nils Diewald2db9ad02013-10-29 19:26:43 +00001000
# Remove the temporary extraction directory, if one is set, and report
# the object count returned by remove_tree().
EXTRACT_CLEANUP: {

  # Nothing to clean up
  last EXTRACT_CLEANUP unless $extract_dir;

  my $objects = remove_tree($extract_dir, { safe => 1 });
  print "Removed directory $extract_dir with $objects objects.\n";
};


# Trailing blank line for the terminal output
print "\n";
1009
Nils Diewald2db9ad02013-10-29 19:26:43 +00001010__END__
Akron941c1a62016-02-23 17:41:41 +01001011
1012=pod
1013
1014=encoding utf8
1015
1016=head1 NAME
1017
Akronf7ad89e2016-03-16 18:22:47 +01001018korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001019
1020
1021=head1 SYNOPSIS
1022
Akrona76d8352016-10-27 16:27:32 +02001023 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001024
Akron2fd402b2016-10-27 21:26:48 +02001025
Akron941c1a62016-02-23 17:41:41 +01001026=head1 DESCRIPTION
1027
1028L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1029compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001030The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001031
1032
1033=head1 INSTALLATION
1034
1035The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1036
Akronaf386982016-10-12 00:33:25 +02001037 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001038
Akronc13a1702016-03-15 19:33:14 +01001039In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001040be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001041Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001043
1044=head1 ARGUMENTS
1045
Akrona76d8352016-10-27 16:27:32 +02001046 $ korapxml2krill -z --input <directory> --output <filename>
1047
1048Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001049It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001050
Akron941c1a62016-02-23 17:41:41 +01001051=over 2
1052
1053=item B<archive>
1054
Akron081639e2017-04-21 19:01:39 +02001055 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001056
Akron2fd402b2016-10-27 21:26:48 +02001057Converts an archive of KorAP-XML documents. It expects a directory
1058(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001059
1060=item B<extract>
1061
Akrona76d8352016-10-27 16:27:32 +02001062 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1063
1064Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001065
Akron63f20d42017-04-10 23:40:29 +02001066=item B<serial>
1067
1068 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1069
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001075
1076
Akron941c1a62016-02-23 17:41:41 +01001077=back
1078
1079
1080=head1 OPTIONS
1081
1082=over 2
1083
Akrona76d8352016-10-27 16:27:32 +02001084=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001085
Akrona76d8352016-10-27 16:27:32 +02001086Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001087
Akron7606afa2016-10-25 16:23:49 +02001088Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001089document, while C<archive> expects a KorAP-XML corpus folder or a zip
1090file to batch process multiple files.
1091C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001092
Akrona76d8352016-10-27 16:27:32 +02001093C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001094that the first archive listed contains all primary data files
1095and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001096
Akron7606afa2016-10-25 16:23:49 +02001097 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001098
Akron821db3d2017-04-06 21:19:31 +02001099Input may also be defined using BSD glob wildcards.
1100
1101 -i 'file/news*.zip'
1102
1103The extended input array will be sorted in length order, so the shortest
1104path needs to contain all primary data files and all meta data files.
1105
Akron0c3e3752016-06-28 15:55:53 +02001106(The directory structure follows the base directory format,
1107that may include a C<.> root folder.
1108In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001109need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001111
Akron7606afa2016-10-25 16:23:49 +02001112To support zip files, a version of C<unzip> needs to be installed that is
1113compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001114
Akron7606afa2016-10-25 16:23:49 +02001115B<The root folder switch using the hash sign is experimental and
1116may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001117
Akron63f20d42017-04-10 23:40:29 +02001118=item B<--input-base|-ib> <directory>
1119
1120The base directory for inputs.
1121
1122
Akron941c1a62016-02-23 17:41:41 +01001123=item B<--output|-o> <directory|file>
1124
Output folder for archive processing or
document name for single output (optional).
Writes to C<STDOUT> by default
(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001129
1130=item B<--overwrite|-w>
1131
1132Overwrite files that already exist.
1133
Akron3741f8b2016-12-21 19:55:21 +01001134=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001135
1136Define the default tokenization by specifying
1137the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001138of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001139
Akron3741f8b2016-12-21 19:55:21 +01001140
1141=item B<--base-sentences|-bs> <foundry>#<layer>
1142
1143Define the layer for base sentences.
1144If given, this will be used instead of using C<Base#Sentences>.
1145Currently C<DeReKo#Structure> is the only additional layer supported.
1146
1147 Defaults to unset.
1148
1149
1150=item B<--base-paragraphs|-bp> <foundry>#<layer>
1151
1152Define the layer for base paragraphs.
1153If given, this will be used instead of using C<Base#Paragraphs>.
1154Currently C<DeReKo#Structure> is the only additional layer supported.
1155
1156 Defaults to unset.
1157
1158
Akron41ac10b2017-02-08 22:47:25 +01001159=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1160
1161Define the layer for base pagebreaks.
1162Currently C<DeReKo#Structure> is the only layer supported.
1163
1164 Defaults to unset.
1165
1166
Akron941c1a62016-02-23 17:41:41 +01001167=item B<--skip|-s> <foundry>[#<layer>]
1168
Akronf7ad89e2016-03-16 18:22:47 +01001169Skip specific annotations by specifying the foundry
1170(and optionally the layer with a C<#>-prefix),
1171e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001172Can be set multiple times.
1173
Akronc13a1702016-03-15 19:33:14 +01001174=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001175
Akronf7ad89e2016-03-16 18:22:47 +01001176Convert specific annotations by specifying the foundry
1177(and optionally the layer with a C<#>-prefix),
1178e.g. C<Mate> or C<Mate#Morpho>.
1179Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001180
1181=item B<--primary|-p>
1182
Akronc13a1702016-03-15 19:33:14 +01001183Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001184Can be flagged using C<--no-primary> as well.
1185This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001186
1187=item B<--jobs|-j>
1188
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001190for archive processing.
Akron11c80302016-03-18 19:44:43 +01001191Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001192
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1195
Akronc11f7982017-02-21 21:20:14 +01001196Pass -1, and the value will be set automatically to 5
1197times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001198This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001199
Akron9ec88872017-04-12 16:29:06 +02001200=item B<--sequential-extraction|-se>
1201
1202Flag to indicate, if the C<jobs> value also applies to extraction.
1203Some systems may have problems with extracting multiple archives
1204to the same folder at the same time.
1205Can be flagged using C<--no-sequential-extraction> as well.
1206Defaults to C<false>.
1207
Akron35db6e32016-03-17 22:42:22 +01001208=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001209
Akron35db6e32016-03-17 22:42:22 +01001210Define the metadata parser to use. Defaults to C<I5>.
1211Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1212This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001213
1214=item B<--pretty|-y>
1215
Akronc13a1702016-03-15 19:33:14 +01001216Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001217This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001218
1219=item B<--gzip|-z>
1220
Akronf7ad89e2016-03-16 18:22:47 +01001221Compress the output.
1222Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001223
Akron11c80302016-03-18 19:44:43 +01001224=item B<--cache|-c>
1225
1226File to mmap a cache (using L<Cache::FastMmap>).
1227Defaults to C<korapxml2krill.cache> in the calling directory.
1228
1229=item B<--cache-size|-cs>
1230
1231Size of the cache. Defaults to C<50m>.
1232
1233=item B<--cache-init|-ci>
1234
1235Initialize cache file.
1236Can be flagged using C<--no-cache-init> as well.
1237Defaults to C<true>.
1238
1239=item B<--cache-delete|-cd>
1240
1241Delete cache file after processing.
1242Can be flagged using C<--no-cache-delete> as well.
1243Defaults to C<true>.
1244
Akron636aa112017-04-07 18:48:56 +02001245=item B<--config|-cfg>
1246
Configure the parameters of your call in a file
of key-value pairs with whitespace separator:
1249
1250 overwrite 1
1251 token DeReKo#Structure
1252 ...
1253
1254Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001255C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001256C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001257C<output>,
1258C<temp-extract>, C<sequential-extraction>,
1259C<base-sentences>, C<base-paragraphs>,
1260C<base-pagebreaks>,
1261C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001262(semicolon separated), C<anno> (semicolon separated).
1263
Akron81500102017-04-07 20:45:44 +02001264=item B<--temporary-extract|-te>
1265
1266Only valid for the C<archive> command.
1267
1268This will first extract all files into a
1269directory and then will archive.
1270If the directory is given as C<:temp:>,
1271a temporary directory is used.
1272This is especially useful to avoid
1273massive unzipping and potential
1274network latency.
Akron636aa112017-04-07 18:48:56 +02001275
Akrone10ad322016-02-27 10:54:26 +01001276=item B<--sigle|-sg>
1277
Akron20807582016-10-26 17:11:34 +02001278Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001279Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001280I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001281Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001282In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001283On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001284
Akron941c1a62016-02-23 17:41:41 +01001285=item B<--log|-l>
1286
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1288
1289=item B<--help|-h>
1290
1291Print this document.
1292
1293=item B<--version|-v>
1294
1295Print version information.
1296
1297=back
1298
Akronc13a1702016-03-15 19:33:14 +01001299=head1 ANNOTATION SUPPORT
1300
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1305
Akron821db3d2017-04-06 21:19:31 +02001306 Base
1307 #Paragraphs
1308 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001309
Akron821db3d2017-04-06 21:19:31 +02001310 Connexor
1311 #Morpho
1312 #Phrase
1313 #Sentences
1314 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001315
Akron821db3d2017-04-06 21:19:31 +02001316 CoreNLP
1317 #Constituency
1318 #Morpho
1319 #NamedEntities
1320 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001321
Akron821db3d2017-04-06 21:19:31 +02001322 DeReKo
1323 #Structure
Akronc13a1702016-03-15 19:33:14 +01001324
Akron821db3d2017-04-06 21:19:31 +02001325 DRuKoLa
1326 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001327
Akron821db3d2017-04-06 21:19:31 +02001328 Glemm
1329 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001330
Akron821db3d2017-04-06 21:19:31 +02001331 Malt
1332 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001333
Akron821db3d2017-04-06 21:19:31 +02001334 MarMoT
1335 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001336
Akron821db3d2017-04-06 21:19:31 +02001337 Mate
1338 #Dependency
1339 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001340
Akron821db3d2017-04-06 21:19:31 +02001341 MDParser
1342 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001343
Akron821db3d2017-04-06 21:19:31 +02001344 OpenNLP
1345 #Morpho
1346 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001347
Akron821db3d2017-04-06 21:19:31 +02001348 Sgbr
1349 #Lemma
1350 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001351
Akron821db3d2017-04-06 21:19:31 +02001352 TreeTagger
1353 #Morpho
1354 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001355
Akron821db3d2017-04-06 21:19:31 +02001356 XIP
1357 #Constituency
1358 #Morpho
1359 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001360
Akronc13a1702016-03-15 19:33:14 +01001361
1362More importers are in preparation.
1363New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1364See the built-in annotation importers as examples.
1365
Akron941c1a62016-02-23 17:41:41 +01001366=head1 AVAILABILITY
1367
1368 https://github.com/KorAP/KorAP-XML-Krill
1369
1370
1371=head1 COPYRIGHT AND LICENSE
1372
Akron3ec0a1c2017-01-18 14:41:55 +01001373Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001374
Akron941c1a62016-02-23 17:41:41 +01001375Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001376
Akrona76d8352016-10-27 16:27:32 +02001377Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001378
1379L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1380Corpus Analysis Platform at the
1381L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1382member of the
1383L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1384
1385This program is free software published under the
1386L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1387
1388=cut