blob: b1838152ece8bbef7684dd32e3c7b2f8a0125ac1 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron941c1a62016-02-23 17:41:41 +0100119# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100120
Akronce125b62017-06-19 11:54:36 +0200121our $LAST_CHANGE = '2017/06/19';
Akron941c1a62016-02-23 17:41:41 +0100122our $LOCAL = $FindBin::Bin;
123our $VERSION_MSG = <<"VERSION";
124Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
125VERSION
126
# Forward declarations (both subs take a single scalar argument)
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a leading argument that does not start with '-'
# is taken as the command and removed from the argument list
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /\A-/;

# Keep a copy of the remaining arguments
# (serial processing passes them on to the archive runs)
my @keep_argv = @ARGV;

# Options that can be given multiple times
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100141
# Parse options from the command line
#
# Scalar options are declared in place, so the variables are available
# to the rest of the script. List options fill the arrays declared above.
# GetOptions returns false on unknown or malformed options (after
# printing a warning); in that case the usage is shown and the script
# stops, instead of silently running with an option missing.
GetOptions(
  'input|i=s'   => \@input,
  'input-base|ib=s' => \(my $input_base),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'base-sentences|bs=s'   => \(my $base_sentences),
  'base-paragraphs|bp=s'  => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
  'gzip|z'      => \(my $gzip),
  'temporary-extract|te=s' => \(my $extract_dir),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file),
  'config|cfg=s' => \(my $cfg_file),
  'log|l=s'     => \(my $log_level),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs),
  'to-tar'      => \(my $to_tar),
  'sequential-extraction|se' => \(my $sequential_extraction),
  'cache-size|cs=s'  => \(my $cache_size),
  'cache-delete|cd!' => \(my $cache_delete),
  'cache-init|ci!'   => \(my $cache_init),

  # Print the full usage and stop
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message and stop
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exitval  => 1
);
185
Akron63f20d42017-04-10 23:40:29 +0200186
# Load from configuration
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options, as pairs of configuration key and target.
  # A configuration value is only taken, if the option
  # was not set on the command line.
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],   # Input root base directory
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],   # Token base
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],       # Write to tar
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;
    next if defined ${$target};
    next unless defined $config{$key};
    ${$target} = $config{$key};
  };

  # Multi value options are separated by semicolons in the configuration
  # and are only taken, if no value was given on the command line
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;
    next if scalar(@{$target});
    next unless defined $config{$key};
    @{$target} = split /\s*;\s*/, $config{$key};
  };
};
298
Akron63f20d42017-04-10 23:40:29 +0200299
# Set defaults for all options that are neither given
# on the command line nor in the configuration
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;

# Base annotations default to the empty string and are lowercased
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');


# Initialize log4perl object - all messages are written
# to STDERR using the requested log level
my %log_config = (
  'log4perl.rootLogger'             => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'        => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');


# Report the configuration file passed
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
330
331
# Usage parameters for invalid calls
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means: derive the number of jobs
# from the number of available cores (five jobs per core)
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
352
Akron821db3d2017-04-06 21:19:31 +0200353
# Start serial processing
#
# Every input is converted by a separate "archive" run of this script,
# one run after the other. All options except for input and output are
# passed on; input and output are set per run. The script exits with a
# non-zero status, if the output is invalid or if any of the runs fails.
# (The command may be undefined, if only options were passed.)
if (defined $cmd && $cmd eq 'serial') {

  # Without --to-tar the output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Remove all inputs and outputs from the arguments to pass on,
  # both in the form "--input value" and "--input=value"
  my $io_flag = qr/\A(?:-i|--input|-o|--output)/;
  my $remove_next = 0;
  my @passed_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg =~ /${io_flag}\z/) {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Input or output flag with attached value
    elsif ($arg =~ /${io_flag}=/) {
      next;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;


  # Number of archive runs that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # Name of the output for this input, based on the input pattern
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit(1);
      };
    };

    # Create archive command (the list form bypasses the shell)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving and remember failing runs
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
408
# Foundries and layers to skip (compared in lowercase)
my %skip = map { ( lc($_) => 1 ) } @skip;

# Collect all base annotations that are taken from DeReKo#Structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotation layers as [Foundry, Layer, (Parameter)]
my @layers = (

  # Base - only if no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - the base annotations are passed as a parameter
  (
    @dereko_attr ?
      ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
      ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
487
Akron4fa37c32017-01-20 14:43:10 +0100488
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations given explicitly
# (in the form Foundry#Layer)
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
507
# Get tokenization basis (in the form Foundry#Layer)
# The declaration is separated from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache shared via the cache file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that converts single texts
# based on the options given
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
531
# Get file name based on path information
#
# Expects the path of a text and returns a flat name for it:
# Temporary directory fragments and the path of the first input
# are removed and the remaining path segments are joined by '-'.
# If the name contains at least three '-', everything up to and
# including the first '-' is dropped.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input - in case it is a directory,
  # only the part up to the last slash is used
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is quoted, so characters like '.'
  # or '+' are matched literally instead of being interpreted as
  # regular expression syntax.
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path and remove leading separators
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
548
Akron63f20d42017-04-10 23:40:29 +0200549
# Turn an input path (possibly containing glob patterns)
# into a flat name, that can be used for output
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # All substitutions are applied to the name in place
  for ($name) {
    s![\\\/]!-!g;        # Path separators become binding characters
    s/[\*\?]//g;         # Wildcards are dropped
    s/[\{\}\[\]]/-/g;    # Brackets of classes and alternatives
    s/\-\-+/-/g;         # Squeeze sequences of binding characters
    s/^-//;              # No binding character at the beginning
    s/-$//;              # No binding character at the end
    s/\.zip$//;          # No file extension
  };

  return $name;
};
561
562
# Convert sigle to path construct
# (e.g. "Corpus_Doc.Text" becomes "Corpus/Doc/Text")
foreach my $text_sigle (@sigle) {
  $text_sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# If a command is given, the output has to be an existing directory,
# unless the output is written to a tar file.
# A missing directory is an error, so the exit status is non-zero.
if ($cmd && $output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
572
Akron63f20d42017-04-10 23:40:29 +0200573
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand all wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input)."\n";
};
593
594
# Process a single file
#
# Without a command, the first input is converted directly.
# The exit status is 0, if the conversion succeeded, and 1 otherwise.
unless ($cmd) {
  my $input = $input[0];

  # Take the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file - the return value is false on failure
  my $processed = $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
  exit($processed ? 0 : 1);
};
626
Nils Diewald59094f22014-11-05 18:20:50 +0000627
Akrone10ad322016-02-27 10:54:26 +0100628# Extract XML files
Akron81500102017-04-07 20:45:44 +0200629if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100630
Akron7d4cdd82016-08-17 21:39:45 +0200631 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200632 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100633
Akron7d4cdd82016-08-17 21:39:45 +0200634 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100635 unless ($archive->test_unzip) {
636 print "Unzip is not installed or incompatible.\n\n";
Akron81500102017-04-07 20:45:44 +0200637 exit(0);
Akrone10ad322016-02-27 10:54:26 +0100638 };
639
Akronb0c88db2016-06-29 16:33:18 +0200640 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200641 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200642
Akron651cb8d2016-08-16 21:44:49 +0200643 my $prefix = 1;
644
Akron03b24db2016-08-16 20:54:32 +0200645 # No sigles given
646 unless (@sigle) {
647
648 # Get files
649 foreach ($archive->list_texts) {
650
651 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200652 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200653
654 # TODO: Make this OS independent
655 push @sigle, join '/', $corpus, $doc, $text;
656 };
Akron20807582016-10-26 17:11:34 +0200657 }
658
659 # Check sigle for doc sigles
660 else {
661 my @new_sigle;
662
663 my $prefix_check = 0;
664
665 # Iterate over all sigle
666 foreach (@sigle) {
667
668 # Sigle is a doc sigle
669 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200670
Akron60a8caa2017-02-17 21:51:27 +0100671 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200672 # Check if a prefix is needed
673 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100674
675 if ($prefix = $archive->check_prefix) {
676 print " with prefix ...";
677 };
Akron20807582016-10-26 17:11:34 +0200678 $prefix_check = 1;
679 };
680
Akron60a8caa2017-02-17 21:51:27 +0100681 print "\n";
682
Akron20807582016-10-26 17:11:34 +0200683 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200684 my $path = ($prefix ? './' : '') . $_;
685
686 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200687 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200688 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200689 ) ? '' : 'not '
690 );
691 print "extracted.\n";
692 }
Akron60a8caa2017-02-17 21:51:27 +0100693
694 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200695 else {
696 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100697
698 unless ($prefix_check) {
699
700 if ($prefix = $archive->check_prefix) {
701 print " with prefix ...";
702 };
703 $prefix_check = 1;
704 };
Akron20807582016-10-26 17:11:34 +0200705 };
706 };
707 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200708 };
709
Akrone10ad322016-02-27 10:54:26 +0100710 # Iterate over all given sigles and extract
711 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100712
Akron2812ba22016-10-28 21:55:59 +0200713 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200714
Akron03b24db2016-08-16 20:54:32 +0200715 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200716 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100717
Akron20807582016-10-26 17:11:34 +0200718 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200719 ($prefix ? './' : '') . $_, $output
720 ) ? '' : 'not '
721 );
Akrone10ad322016-02-27 10:54:26 +0100722 print "extracted.\n";
723 };
Akronb0c88db2016-06-29 16:33:18 +0200724 }
Akron7d4cdd82016-08-17 21:39:45 +0200725
726 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200727 else {
728 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron81500102017-04-07 20:45:44 +0200729 exit(1);
Akrone10ad322016-02-27 10:54:26 +0100730 };
731}
732
Akron81500102017-04-07 20:45:44 +0200733
Akron941c1a62016-02-23 17:41:41 +0100734# Process an archive
735elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000736
Akron81500102017-04-07 20:45:44 +0200737 my $archive_output;
738
739 # First extract, then archive
740 if (defined $extract_dir) {
741
742 # Create new archive object
743 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
744
745 # Check zip capabilities
746 unless ($archive->test_unzip) {
747 print "Unzip is not installed or incompatible.\n\n";
748 exit(0);
749 };
750
751 # Add further annotation archived
752 $archive->attach($_) foreach @input[1..$#input];
753
754 # Create a temporary directory
755 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200756 $extract_dir = tempdir(CLEANUP => 0);
757 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200758 };
759
Akron63f20d42017-04-10 23:40:29 +0200760 # Add some random extra to avoid clashes with multiple archives
761 $extract_dir = catdir($extract_dir, random_string('cccccc'));
762
763 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200764 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200765 @input = ($extract_dir);
766 }
767 else {
768 $log->error('Unable to extract from primary archive ' . $input[0] .
769 ' to ' . $extract_dir);
770 exit(1);
771 };
772 }
773
774 # Can't create archive object
775 else {
776 $log->error('Unable to extract from primary archive ' . $input[0]);
777 exit(1);
778 };
779 };
780
Akrone1dbc382016-07-08 22:24:52 +0200781 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100782
Akron7d4cdd82016-08-17 21:39:45 +0200783 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100784 my $pool = Parallel::ForkManager->new($jobs);
785
Akron7d4cdd82016-08-17 21:39:45 +0200786 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100787 my $iter = 1; # Current text in process
788
Akronda3097e2017-04-23 19:53:57 +0200789 my $tar_archive;
790 my $output_dir = $output;
791 my $tar_fh;
792
793 # Initialize tar archive
794 if ($to_tar) {
795 $tar_archive = Archive::Tar::Builder->new(
796 ignore_errors => 1
797 );
798
799 # Set output name
800 my $tar_file = $output;
801 unless ($tar_file =~ /\.tar$/) {
802 $tar_file .= '.tar';
803 };
804
805 # Initiate the tar file
806 print "Writing to file $tar_file\n";
807 $tar_fh = IO::File->new($tar_file, 'w');
808 $tar_fh->binmode(1);
809
810 # Set handle
811 $tar_archive->set_handle($tar_fh);
812
813 # Output to temporary directory
814 $output_dir = File::Temp->newdir;
815 };
816
Akron941c1a62016-02-23 17:41:41 +0100817 # Report on fork message
818 $pool->run_on_finish (
819 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200820 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100821 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200822
Akron08385f62016-03-22 20:37:04 +0100823 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200824 ($iter++) . "/$count]" .
825 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200826 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200827
828 if (!$code && $to_tar && $data->[2]) {
829 my $filename = $data->[2];
830
831 # Lock filehandle
832 if (flock($tar_fh, LOCK_EX)) {
833
834 # Archive and remove file
835 $tar_archive->archive($filename);
836 unlink $filename;
837
838 # Unlock filehandle
839 flock($tar_fh, LOCK_UN);
840 }
841 else {
842 $log->warn("Unable to add $filename to archive");
843 };
844 };
845
Akron4c0cf312016-10-15 16:42:09 +0200846 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100847 }
848 );
849
850 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200851 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100852 print "Reading data ...\n";
853
Akron7d4cdd82016-08-17 21:39:45 +0200854 # unless (Cache::FastMmap->new(
855 # share_file => $cache_file,
856 # cache_size => $cache_size,
857 # init_file => $cache_init
858 # )) {
859 # print "Unable to intialize cache '$cache_file'\n\n";
860 # exit(1);
861 # };
Akron11c80302016-03-18 19:44:43 +0100862
Akron486f9ab2017-04-22 23:25:19 +0200863
Akron941c1a62016-02-23 17:41:41 +0100864 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100865 if (-d $input[0]) {
866 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100867 my @dirs;
868 my $dir;
869
Akron7d4cdd82016-08-17 21:39:45 +0200870 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100871 while (1) {
872 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200873 push @dirs, $dir;
874 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100875 };
876 last unless $it->next;
877 };
878
879 print "Start processing ...\n";
880 $t = Benchmark->new;
881 $count = scalar @dirs;
882
883 DIRECTORY_LOOP:
884 for (my $i = 0; $i < $count; $i++) {
885
Akrone1dbc382016-07-08 22:24:52 +0200886 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200887 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200888 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200889 );
Akron941c1a62016-02-23 17:41:41 +0100890
891 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200892 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200893
Akron13d56622016-10-31 14:54:49 +0100894 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200895 $pool->finish(
896 0,
Akronda3097e2017-04-23 19:53:57 +0200897 [
898 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
899 undef,
900 $filename
901 ]
Akron486f9ab2017-04-22 23:25:19 +0200902 );
Akron3ec48972016-08-17 23:24:52 +0200903 }
904 else {
Akron4c0cf312016-10-15 16:42:09 +0200905 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200906 };
Akron941c1a62016-02-23 17:41:41 +0100907 };
908 }
909
910 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200911 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200912
Akron941c1a62016-02-23 17:41:41 +0100913 unless ($archive->test_unzip) {
914 print "Unzip is not installed or incompatible.\n\n";
915 exit(1);
916 };
917
Akron08385f62016-03-22 20:37:04 +0100918 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200919 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100920
Akron941c1a62016-02-23 17:41:41 +0100921 print "Start processing ...\n";
922 $t = Benchmark->new;
923 my @dirs = $archive->list_texts;
924 $count = scalar @dirs;
925
926 ARCHIVE_LOOP:
927 for (my $i = 0; $i < $count; $i++) {
928
929 # Split path information
930 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
931
Akrone1dbc382016-07-08 22:24:52 +0200932 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200933 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200934 get_file_name(
935 catfile($corpus, $doc, $text)
936 . '.json' . ($gzip ? '.gz' : '')
937 )
Akrone1dbc382016-07-08 22:24:52 +0200938 );
Akron941c1a62016-02-23 17:41:41 +0100939
940 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200941 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100942
Akron4c0cf312016-10-15 16:42:09 +0200943 # Create temporary file
944 $temp = File::Temp->newdir;
945
Akronbdf434a2016-10-24 17:42:07 +0200946 # TODO: Check if $filename exist at the beginning,
947 # because extraction can be horrible slow!
948
Akron941c1a62016-02-23 17:41:41 +0100949 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200950 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100951
Akron7d4cdd82016-08-17 21:39:45 +0200952 # Create corpus directory
953 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100954
Akron7d4cdd82016-08-17 21:39:45 +0200955 # Temporary directory
956 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100957
Akron7d4cdd82016-08-17 21:39:45 +0200958 # Write file
Akron13d56622016-10-31 14:54:49 +0100959 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200960
Akron4c0cf312016-10-15 16:42:09 +0200961 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100962 $pool->finish(
963 0,
Akronda3097e2017-04-23 19:53:57 +0200964 [
965 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
966 $temp,
967 $filename
968 ]
Akron13d56622016-10-31 14:54:49 +0100969 );
970 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200971 }
972 else {
Akron4c0cf312016-10-15 16:42:09 +0200973 # Delete temporary file
974 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200975 };
Akron941c1a62016-02-23 17:41:41 +0100976 }
Akron7d4cdd82016-08-17 21:39:45 +0200977
978 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100979 else {
Akron4c0cf312016-10-15 16:42:09 +0200980 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100981 };
982 };
983 }
984
985 else {
986 print "Input is neither a directory nor an archive.\n\n";
987 };
988
989 $pool->wait_all_children;
990
Akron11c80302016-03-18 19:44:43 +0100991 # Delete cache file
992 unlink($cache_file) if $cache_delete;
993
Akronda3097e2017-04-23 19:53:57 +0200994 # Close tar filehandle
995 if ($to_tar && $tar_fh) {
996 $tar_archive->finish;
997 $tar_fh->close;
998 print "Wrote to tar archive.\n";
999 };
1000
Akron63f20d42017-04-10 23:40:29 +02001001 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001002 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001003};
Akron941c1a62016-02-23 17:41:41 +01001004
Nils Diewald2db9ad02013-10-29 19:26:43 +00001005
# Clean up: when an extraction directory was in use, delete it
# recursively and report how many file system objects were removed.
if ($extract_dir) {
  my %removal_opts = (safe => 1);
  my $objects = remove_tree($extract_dir, \%removal_opts);
  print "Removed directory $extract_dir with $objects objects.\n";
};

# Terminate the output with an empty line
print "\n";
1014
Nils Diewald2db9ad02013-10-29 19:26:43 +00001015__END__
Akron941c1a62016-02-23 17:41:41 +01001016
1017=pod
1018
1019=encoding utf8
1020
1021=head1 NAME
1022
Akronf7ad89e2016-03-16 18:22:47 +01001023korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001024
1025
1026=head1 SYNOPSIS
1027
Akrona76d8352016-10-27 16:27:32 +02001028 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001029
Akron2fd402b2016-10-27 21:26:48 +02001030
Akron941c1a62016-02-23 17:41:41 +01001031=head1 DESCRIPTION
1032
1033L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1034compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001035The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001036
1037
1038=head1 INSTALLATION
1039
1040The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1041
Akronaf386982016-10-12 00:33:25 +02001042 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001043
Akronc13a1702016-03-15 19:33:14 +01001044In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001045be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001046Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, the C<unzip> tool needs to be present to work with zip archives.
Akron941c1a62016-02-23 17:41:41 +01001048
1049=head1 ARGUMENTS
1050
Akrona76d8352016-10-27 16:27:32 +02001051 $ korapxml2krill -z --input <directory> --output <filename>
1052
1053Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001054It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001055
Akron941c1a62016-02-23 17:41:41 +01001056=over 2
1057
1058=item B<archive>
1059
Akron081639e2017-04-21 19:01:39 +02001060 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001061
Akron2fd402b2016-10-27 21:26:48 +02001062Converts an archive of KorAP-XML documents. It expects a directory
1063(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001064
1065=item B<extract>
1066
Akrona76d8352016-10-27 16:27:32 +02001067 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1068
1069Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001070
Akron63f20d42017-04-10 23:40:29 +02001071=item B<serial>
1072
1073 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1074
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001080
1081
Akron941c1a62016-02-23 17:41:41 +01001082=back
1083
1084
1085=head1 OPTIONS
1086
1087=over 2
1088
Akrona76d8352016-10-27 16:27:32 +02001089=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001090
Akrona76d8352016-10-27 16:27:32 +02001091Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001092
Akron7606afa2016-10-25 16:23:49 +02001093Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001094document, while C<archive> expects a KorAP-XML corpus folder or a zip
1095file to batch process multiple files.
1096C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001097
C<archive> supports multiple input zip files with the constraint
that the first archive listed contains all primary data files
and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001101
Akron7606afa2016-10-25 16:23:49 +02001102 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001103
Akron821db3d2017-04-06 21:19:31 +02001104Input may also be defined using BSD glob wildcards.
1105
1106 -i 'file/news*.zip'
1107
1108The extended input array will be sorted in length order, so the shortest
1109path needs to contain all primary data files and all meta data files.
1110
(The directory structure follows the base directory format,
which may include a C<.> root folder.
In this case, further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001116
Akron7606afa2016-10-25 16:23:49 +02001117To support zip files, a version of C<unzip> needs to be installed that is
1118compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001119
Akron7606afa2016-10-25 16:23:49 +02001120B<The root folder switch using the hash sign is experimental and
1121may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001122
Akron63f20d42017-04-10 23:40:29 +02001123=item B<--input-base|-ib> <directory>
1124
1125The base directory for inputs.
1126
1127
Akron941c1a62016-02-23 17:41:41 +01001128=item B<--output|-o> <directory|file>
1129
1130Output folder for archive processing or
1131document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001132writes to C<STDOUT> by default
1133(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001134
1135=item B<--overwrite|-w>
1136
1137Overwrite files that already exist.
1138
Akron3741f8b2016-12-21 19:55:21 +01001139=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001140
1141Define the default tokenization by specifying
1142the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001143of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001144
Akron3741f8b2016-12-21 19:55:21 +01001145
1146=item B<--base-sentences|-bs> <foundry>#<layer>
1147
1148Define the layer for base sentences.
1149If given, this will be used instead of using C<Base#Sentences>.
1150Currently C<DeReKo#Structure> is the only additional layer supported.
1151
1152 Defaults to unset.
1153
1154
1155=item B<--base-paragraphs|-bp> <foundry>#<layer>
1156
1157Define the layer for base paragraphs.
1158If given, this will be used instead of using C<Base#Paragraphs>.
1159Currently C<DeReKo#Structure> is the only additional layer supported.
1160
1161 Defaults to unset.
1162
1163
Akron41ac10b2017-02-08 22:47:25 +01001164=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1165
1166Define the layer for base pagebreaks.
1167Currently C<DeReKo#Structure> is the only layer supported.
1168
1169 Defaults to unset.
1170
1171
Akron941c1a62016-02-23 17:41:41 +01001172=item B<--skip|-s> <foundry>[#<layer>]
1173
Akronf7ad89e2016-03-16 18:22:47 +01001174Skip specific annotations by specifying the foundry
1175(and optionally the layer with a C<#>-prefix),
1176e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001177Can be set multiple times.
1178
Akronc13a1702016-03-15 19:33:14 +01001179=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001180
Akronf7ad89e2016-03-16 18:22:47 +01001181Convert specific annotations by specifying the foundry
1182(and optionally the layer with a C<#>-prefix),
1183e.g. C<Mate> or C<Mate#Morpho>.
1184Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001185
1186=item B<--primary|-p>
1187
Akronc13a1702016-03-15 19:33:14 +01001188Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001189Can be flagged using C<--no-primary> as well.
1190This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001191
1192=item B<--jobs|-j>
1193
Define the number of concurrent jobs in separate forks
for archive processing.
Akron11c80302016-03-18 19:44:43 +01001196Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001197
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1200
Akronc11f7982017-02-21 21:20:14 +01001201Pass -1, and the value will be set automatically to 5
1202times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001203This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001204
Akron9ec88872017-04-12 16:29:06 +02001205=item B<--sequential-extraction|-se>
1206
1207Flag to indicate, if the C<jobs> value also applies to extraction.
1208Some systems may have problems with extracting multiple archives
1209to the same folder at the same time.
1210Can be flagged using C<--no-sequential-extraction> as well.
1211Defaults to C<false>.
1212
Akron35db6e32016-03-17 22:42:22 +01001213=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001214
Akron35db6e32016-03-17 22:42:22 +01001215Define the metadata parser to use. Defaults to C<I5>.
1216Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1217This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001218
1219=item B<--pretty|-y>
1220
Akronc13a1702016-03-15 19:33:14 +01001221Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001222This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001223
1224=item B<--gzip|-z>
1225
Akronf7ad89e2016-03-16 18:22:47 +01001226Compress the output.
1227Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001228
Akron11c80302016-03-18 19:44:43 +01001229=item B<--cache|-c>
1230
1231File to mmap a cache (using L<Cache::FastMmap>).
1232Defaults to C<korapxml2krill.cache> in the calling directory.
1233
1234=item B<--cache-size|-cs>
1235
1236Size of the cache. Defaults to C<50m>.
1237
1238=item B<--cache-init|-ci>
1239
1240Initialize cache file.
1241Can be flagged using C<--no-cache-init> as well.
1242Defaults to C<true>.
1243
1244=item B<--cache-delete|-cd>
1245
1246Delete cache file after processing.
1247Can be flagged using C<--no-cache-delete> as well.
1248Defaults to C<true>.
1249
Akron636aa112017-04-07 18:48:56 +02001250=item B<--config|-cfg>
1251
Configure the parameters of your call in a file
of key-value pairs with whitespace as separator:
1254
1255 overwrite 1
1256 token DeReKo#Structure
1257 ...
1258
1259Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001260C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001261C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001262C<output>,
1263C<temp-extract>, C<sequential-extraction>,
1264C<base-sentences>, C<base-paragraphs>,
1265C<base-pagebreaks>,
1266C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001267(semicolon separated), C<anno> (semicolon separated).
1268
Akron81500102017-04-07 20:45:44 +02001269=item B<--temporary-extract|-te>
1270
1271Only valid for the C<archive> command.
1272
1273This will first extract all files into a
1274directory and then will archive.
1275If the directory is given as C<:temp:>,
1276a temporary directory is used.
1277This is especially useful to avoid
1278massive unzipping and potential
1279network latency.
Akron636aa112017-04-07 18:48:56 +02001280
Akrone10ad322016-02-27 10:54:26 +01001281=item B<--sigle|-sg>
1282
Akron20807582016-10-26 17:11:34 +02001283Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001284Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001285I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001286Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001287In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001288On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001289
Akron941c1a62016-02-23 17:41:41 +01001290=item B<--log|-l>
1291
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1293
1294=item B<--help|-h>
1295
1296Print this document.
1297
1298=item B<--version|-v>
1299
1300Print version information.
1301
1302=back
1303
Akronc13a1702016-03-15 19:33:14 +01001304=head1 ANNOTATION SUPPORT
1305
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1310
Akron821db3d2017-04-06 21:19:31 +02001311 Base
1312 #Paragraphs
1313 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001314
Akron821db3d2017-04-06 21:19:31 +02001315 Connexor
1316 #Morpho
1317 #Phrase
1318 #Sentences
1319 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001320
Akron821db3d2017-04-06 21:19:31 +02001321 CoreNLP
1322 #Constituency
1323 #Morpho
1324 #NamedEntities
1325 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001326
Akronce125b62017-06-19 11:54:36 +02001327 CMC
1328 #Morpho
1329
Akron821db3d2017-04-06 21:19:31 +02001330 DeReKo
1331 #Structure
Akronc13a1702016-03-15 19:33:14 +01001332
Akron821db3d2017-04-06 21:19:31 +02001333 DRuKoLa
1334 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001335
Akron821db3d2017-04-06 21:19:31 +02001336 Glemm
1337 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001338
Akron821db3d2017-04-06 21:19:31 +02001339 Malt
1340 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001341
Akron821db3d2017-04-06 21:19:31 +02001342 MarMoT
1343 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001344
Akron821db3d2017-04-06 21:19:31 +02001345 Mate
1346 #Dependency
1347 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001348
Akron821db3d2017-04-06 21:19:31 +02001349 MDParser
1350 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001351
Akron821db3d2017-04-06 21:19:31 +02001352 OpenNLP
1353 #Morpho
1354 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001355
Akron821db3d2017-04-06 21:19:31 +02001356 Sgbr
1357 #Lemma
1358 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001359
Akron821db3d2017-04-06 21:19:31 +02001360 TreeTagger
1361 #Morpho
1362 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001363
Akron821db3d2017-04-06 21:19:31 +02001364 XIP
1365 #Constituency
1366 #Morpho
1367 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001368
Akronc13a1702016-03-15 19:33:14 +01001369
1370More importers are in preparation.
1371New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1372See the built-in annotation importers as examples.
1373
Akron941c1a62016-02-23 17:41:41 +01001374=head1 AVAILABILITY
1375
1376 https://github.com/KorAP/KorAP-XML-Krill
1377
1378
1379=head1 COPYRIGHT AND LICENSE
1380
Akron3ec0a1c2017-01-18 14:41:55 +01001381Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001382
Akron941c1a62016-02-23 17:41:41 +01001383Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001384
Akrona76d8352016-10-27 16:27:32 +02001385Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001386
1387L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1388Corpus Analysis Platform at the
1389L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1390member of the
1391L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1392
1393This program is free software published under the
1394L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1395
1396=cut