blob: 1b994c239d91c34badd39d1111af192f03d9b331 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akronc11f7982017-02-21 21:20:14 +010029
30# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010031# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010032# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010033
Akronc11f7982017-02-21 21:20:14 +010034# TODO: Use KorAP::XML::ForkPool!
35
Akron941c1a62016-02-23 17:41:41 +010036# CHANGES:
37# ----------------------------------------------------------
38# 2013/11/25
39# - Initial release
40#
41# 2014/10/29
42# - Merges foundry data to create indexer friendly documents
43#
Akron93d620e2016-02-05 19:40:05 +010044# 2016/02/04
45# - renamed to korapxml2krill
46# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010047#
48# 2016/02/12
49# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010050# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010051#
52# 2016/02/14
53# - Added version information
Akron941c1a62016-02-23 17:41:41 +010054# - Added support for archive files
55#
56# 2016/02/15
57# - Fixed temporary directory bug
58# - Improved skipping before unzipping
59# - Added EXPERIMENTAL concurrency support
60#
61# 2016/02/23
62# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010063#
64# 2016/02/27
65# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010066#
67# 2016/03/17
68# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010069#
70# 2016/03/18
71# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020072#
Akronf3f0c942016-06-27 13:27:14 +020073# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020074# - Added multi archive support
75# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020076# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020077#
78# 2016/07/06
79# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020080#
81# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020082# - Fixed temporary path issue in script
83#
84# 2016/10/24
85# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020086#
Akronb4bbec72016-10-26 20:21:02 +020087# 2016/10/24
88# - Added support for document extraction
89#
Akron3741f8b2016-12-21 19:55:21 +010090# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020091# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020092#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/12/21
94# - added support for base-sentences and base-tokenizations
95#
Akron4fa37c32017-01-20 14:43:10 +010096# 2017/01/20
97# - added support for DRuKoLa annotations
98#
Akron41ac10b2017-02-08 22:47:25 +010099# 2017/02/08
100# - added support for pagebreak annotations
101#
Akron821db3d2017-04-06 21:19:31 +0200102# 2017/04/06
103# - added support for wildcards in input
104#
Akron636aa112017-04-07 18:48:56 +0200105# 2017/04/07
106# - support configuration option
Akron81500102017-04-07 20:45:44 +0200107# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200108#
Akron9ec88872017-04-12 16:29:06 +0200109# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200110# - support serial processing
111# - support input root
Akron9ec88872017-04-12 16:29:06 +0200112# - introduced --sequential-extraction flag
Akron941c1a62016-02-23 17:41:41 +0100113# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100114
# Release information shown by the --help and --version switches
our $LAST_CHANGE = '2017/04/12';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# One-line version banner (terminated by a newline)
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
120
Akron63f20d42017-04-10 23:40:29 +0200121# Prototypes
122sub get_file_name_from_glob($);
123sub get_file_name($);
124
# Parse command
our @ARGV;

# The first argument is treated as the command,
# unless it is empty or looks like an option
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Remember the remaining arguments (they are reused by the serial command)
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100132
Akron5f51d422016-08-16 16:26:43 +0200133my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100134my $text;
Akrone10ad322016-02-27 10:54:26 +0100135
# Parse options from the command line
#
# Scalar options are stored in lexicals declared right here, list options
# are collected in the arrays declared above. GetOptions returns false if
# the command line could not be parsed (e.g. unknown or malformed options);
# in that case the usage message is shown and the script terminates instead
# of silently running with unintended settings.
GetOptions(
  'input|i=s' => \@input,
  'input-base|ib=s' => \(my $input_base),
  'output|o=s' => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s' => \(my $meta),
  'token|t=s' => \(my $token_base),
  'base-sentences|bs=s' => \(my $base_sentences),
  'base-paragraphs|bp=s' => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
  'gzip|z' => \(my $gzip),
  'temporary-extract|te=s' => \(my $extract_dir),
  'skip|s=s' => \@skip,
  'sigle|sg=s' => \@sigle,
  'cache|c=s' => \(my $cache_file),
  'config|cfg=s' => \(my $cfg_file),
  'log|l=s' => \(my $log_level),
  'anno|a=s' => \@anno,
  'primary|p!' => \(my $primary),
  'pretty|y' => \(my $pretty),
  'jobs|j=i' => \(my $jobs),
  'sequential-extraction|se' => \(my $sequential_extraction),
  'cache-size|cs=s' => \(my $cache_size),
  'cache-delete|cd!' => \(my $cache_delete),
  'cache-init|ci!' => \(my $cache_init),

  # Print the full usage information
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
      -output => '-'
    );
  },

  # Print the version banner only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG,
      -output => '-'
    )
  }
) or pod2usage(
  # Invalid command line: same report as for missing input
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -output => '-',
  -exit => 1
);
178
Akron63f20d42017-04-10 23:40:29 +0200179
# Load from configuration
#
# Options given on the command line always take precedence:
# A value from the configuration file is only used for
# options that are still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: configuration key => variable to fill
  my @scalar_options = (
    ['overwrite'             => \$overwrite],
    ['gzip'                  => \$gzip],
    ['jobs'                  => \$jobs],
    ['input-base'            => \$input_base],
    ['temporary-extract'     => \$extract_dir],
    ['token'                 => \$token_base],
    ['cache'                 => \$cache_file],
    ['cache-size'            => \$cache_size],
    ['cache-delete'          => \$cache_delete],
    ['cache-init'            => \$cache_init],
    ['sequential-extraction' => \$sequential_extraction],
    ['meta'                  => \$meta],
    ['output'                => \$output],
    ['base-sentences'        => \$base_sentences],
    ['base-paragraphs'       => \$base_paragraphs],
    ['base-pagebreaks'       => \$base_pagebreaks],
    ['log'                   => \$log_level]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # Already set on the command line
    next if defined $$target;

    # Not part of the configuration
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # List options: semicolon separated values => array to fill
  my @list_options = (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # Already set on the command line
    next if scalar(@$target);

    # Not part of the configuration
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
286
Akron63f20d42017-04-10 23:40:29 +0200287
# Set defaults for all options that are neither given
# on the command line nor in the configuration
foreach my $default (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR'],
  [\$base_sentences,        ''],
  [\$base_paragraphs,       ''],
  [\$base_pagebreaks,       '']
) {
  my ($target, $value) = @$default;
  $$target //= $value;
};

# The base annotations are compared in lower case
$_ = lc($_) foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100304
Akron63f20d42017-04-10 23:40:29 +0200305
# Initialize log4perl object:
# Everything from the chosen level upwards is written to STDERR
my %log4perl_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log4perl_config);

my $log = Log::Log4perl->get_logger('main');


# Report the configuration file, if one was passed
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
318
319
# Parameters for the usage message on invalid invocations
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -output => '-',
  -exit => 1
);

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means: derive the number of jobs
# from the number of available cores
if ($jobs eq '-1') {
  state $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
340
Akron821db3d2017-04-06 21:19:31 +0200341
# Start serial processing
#
# Every input is converted by a separate run of this script using the
# 'archive' command. All options of the current invocation are passed on,
# except for the inputs and the output, which are set per run: each input
# gets its own directory below the output directory, named after the input.
#
# The command may be undefined (if the script was called without a command),
# therefore definedness is checked before the comparison.
if (defined $cmd && $cmd eq 'serial') {

  # NOTE(review): the script exits with status 0 here although this
  # is an error - confirm whether callers rely on that.
  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Remove all inputs and outputs from the arguments to pass on
  my $remove_next = 0;
  my @passed_argv;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the following value has to be removed as well
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;


  # Iterate over all inputs
  foreach my $serial_input (@input) {

    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path - make_path returns 0 if nothing was created,
    # which is fine as long as the directory already exists
    if (make_path($new_out) == 0 && !-d $new_out) {
      $log->error("Can\'t create path $new_out");
      exit(0);
    };

    # Create archive command (run with the same perl and the same script)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving (list form - no shell involved)
    system @archive_cmd;
  };

  exit(0);
};
393
# Lookup table of everything that should be skipped (in lower case)
my %skip = map { (lc($_) => 1) } @skip;

# Collect the base annotations that should be taken
# from the DeReKo structure layer
my @dereko_attr;
foreach my $base (
  [$base_sentences,  'sentences'],
  [$base_paragraphs, 'paragraphs'],
  [$base_pagebreaks, 'pagebreaks']
) {
  push @dereko_attr, $base->[1] if $base->[0] eq 'dereko#structure';
};

# All supported annotation layers as [Foundry, Layer] pairs
# (optionally followed by a further parameter)
my @layers;

# The default base layers are only used,
# if no other base was requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo - pass the requested base annotations, if there are any
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
470
Akron4fa37c32017-01-20 14:43:10 +0100471
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly
# requested annotations (Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# neither by Foundry nor by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
490
# Get tokenization basis (given as Foundry#Layer)
#
# The variables are declared separately from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Initialize the shared cache
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
);

# Create batch object - it is configured once
# and used for all conversions
my $batch_file = KorAP::XML::Batch::File->new(
  cache => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry => $token_base_foundry,
  layer => $token_base_layer,
  gzip => $gzip,
  log => $log,
  primary => $primary,
  pretty => $pretty,
  anno => \@filtered_anno
);
514
# Get file name based on path information
#
# Turns a path into a flat file name: Temporary directory fragments and
# the input root are removed and all slashes are replaced by dashes.
#
# Parameter: The path to convert
# Returns:   The flattened file name
#
# The input root is taken from the first element of the file-level @input.
# In case this is a directory, everything following the last slash
# is ignored.
sub get_file_name ($) {
  my $file = shift;

  my $root = $input[0];
  if (-d $root) {
    $root =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input root (optionally preceded by a slash).
  # The root is quoted, as paths may contain characters that are
  # special in regular expressions (e.g. '.', '+' or backslashes).
  $file =~ s/\/?\Q$root\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # With four or more dash separated parts, the first part is removed
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
531
Akron63f20d42017-04-10 23:40:29 +0200532
# Create a directory name based on an input path or wildcard pattern
#
# Path separators and brackets are turned into dashes, wildcards are
# removed and a '.zip' extension is stripped.
#
# Parameter: The input path, possibly containing wildcards
# Returns:   The derived name without leading or trailing dashes
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;       # Transform paths
  $glob =~ s/[\*\?]//g;        # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;        # Remove sequences of binding characters
  $glob =~ s/^-//;             # Clean beginning
  $glob =~ s/-$//;             # Clean end
  $glob =~ s/\.zip$//;         # Remove file extension

  # Removing the extension may uncover another binding character,
  # e.g. 'dir/*.zip' is 'dir-' at this point
  $glob =~ s/-$//;
  return $glob;
};
544
545
# Convert sigle to path construct
# (CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands require the output to be an existing directory, if given
if ($cmd && $output && !(-e $output && -d $output)) {
  print "Directory '$output' does not exist.\n\n";
  exit(0);
};
555
Akron63f20d42017-04-10 23:40:29 +0200556
# Glob and prefix files
#
# All inputs are prefixed with the input base (if given) and expanded.
# A wildcard pattern without any match expands to nothing - if there is
# no input left at all, there is nothing to process and the script
# terminates with an error.
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($pattern));
  };

  # Nothing matched
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit(1);
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
576
577
# Process a single file
#
# This is the default, if no command was passed: The first input is
# converted using the batch object, afterwards the script terminates.
unless ($cmd) {
  my $input = $input[0];

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and since the start of the script
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_last_stop . ' (overall: ' . $overall . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Make sure the path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();

  # NOTE(review): the exit status is 1 although processing
  # is finished at this point - confirm that this is intended.
  exit(1);
};
609
Nils Diewald59094f22014-11-05 18:20:50 +0000610
Akrone10ad322016-02-27 10:54:26 +0100611# Extract XML files
Akron81500102017-04-07 20:45:44 +0200612if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100613
Akron7d4cdd82016-08-17 21:39:45 +0200614 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200615 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100616
Akron7d4cdd82016-08-17 21:39:45 +0200617 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100618 unless ($archive->test_unzip) {
619 print "Unzip is not installed or incompatible.\n\n";
Akron81500102017-04-07 20:45:44 +0200620 exit(0);
Akrone10ad322016-02-27 10:54:26 +0100621 };
622
Akronb0c88db2016-06-29 16:33:18 +0200623 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200624 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200625
Akron651cb8d2016-08-16 21:44:49 +0200626 my $prefix = 1;
627
Akron03b24db2016-08-16 20:54:32 +0200628 # No sigles given
629 unless (@sigle) {
630
631 # Get files
632 foreach ($archive->list_texts) {
633
634 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200635 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200636
637 # TODO: Make this OS independent
638 push @sigle, join '/', $corpus, $doc, $text;
639 };
Akron20807582016-10-26 17:11:34 +0200640 }
641
642 # Check sigle for doc sigles
643 else {
644 my @new_sigle;
645
646 my $prefix_check = 0;
647
648 # Iterate over all sigle
649 foreach (@sigle) {
650
651 # Sigle is a doc sigle
652 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200653
Akron60a8caa2017-02-17 21:51:27 +0100654 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200655 # Check if a prefix is needed
656 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100657
658 if ($prefix = $archive->check_prefix) {
659 print " with prefix ...";
660 };
Akron20807582016-10-26 17:11:34 +0200661 $prefix_check = 1;
662 };
663
Akron60a8caa2017-02-17 21:51:27 +0100664 print "\n";
665
Akron20807582016-10-26 17:11:34 +0200666 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200667 my $path = ($prefix ? './' : '') . $_;
668
669 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200670 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200671 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200672 ) ? '' : 'not '
673 );
674 print "extracted.\n";
675 }
Akron60a8caa2017-02-17 21:51:27 +0100676
677 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200678 else {
679 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100680
681 unless ($prefix_check) {
682
683 if ($prefix = $archive->check_prefix) {
684 print " with prefix ...";
685 };
686 $prefix_check = 1;
687 };
Akron20807582016-10-26 17:11:34 +0200688 };
689 };
690 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200691 };
692
Akrone10ad322016-02-27 10:54:26 +0100693 # Iterate over all given sigles and extract
694 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100695
Akron2812ba22016-10-28 21:55:59 +0200696 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200697
Akron03b24db2016-08-16 20:54:32 +0200698 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200699 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100700
Akron20807582016-10-26 17:11:34 +0200701 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200702 ($prefix ? './' : '') . $_, $output
703 ) ? '' : 'not '
704 );
Akrone10ad322016-02-27 10:54:26 +0100705 print "extracted.\n";
706 };
Akronb0c88db2016-06-29 16:33:18 +0200707 }
Akron7d4cdd82016-08-17 21:39:45 +0200708
709 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200710 else {
711 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron81500102017-04-07 20:45:44 +0200712 exit(1);
Akrone10ad322016-02-27 10:54:26 +0100713 };
714}
715
Akron81500102017-04-07 20:45:44 +0200716
Akron941c1a62016-02-23 17:41:41 +0100717# Process an archive
718elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000719
Akron81500102017-04-07 20:45:44 +0200720 my $archive_output;
721
722 # First extract, then archive
723 if (defined $extract_dir) {
724
725 # Create new archive object
726 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
727
728 # Check zip capabilities
729 unless ($archive->test_unzip) {
730 print "Unzip is not installed or incompatible.\n\n";
731 exit(0);
732 };
733
734 # Add further annotation archived
735 $archive->attach($_) foreach @input[1..$#input];
736
737 # Create a temporary directory
738 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200739 $extract_dir = tempdir(CLEANUP => 0);
740 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200741 };
742
Akron63f20d42017-04-10 23:40:29 +0200743 # Add some random extra to avoid clashes with multiple archives
744 $extract_dir = catdir($extract_dir, random_string('cccccc'));
745
      # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200747 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200748 @input = ($extract_dir);
749 }
750 else {
751 $log->error('Unable to extract from primary archive ' . $input[0] .
752 ' to ' . $extract_dir);
753 exit(1);
754 };
755 }
756
757 # Can't create archive object
758 else {
759 $log->error('Unable to extract from primary archive ' . $input[0]);
760 exit(1);
761 };
762 };
763
Akrone1dbc382016-07-08 22:24:52 +0200764 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100765
Akron7d4cdd82016-08-17 21:39:45 +0200766 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100767 my $pool = Parallel::ForkManager->new($jobs);
768
Akron7d4cdd82016-08-17 21:39:45 +0200769 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100770 my $iter = 1; # Current text in process
771
772 # Report on fork message
773 $pool->run_on_finish (
774 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200775 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100776 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200777
Akron08385f62016-03-22 20:37:04 +0100778 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200779 ($iter++) . "/$count]" .
780 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200781 ' ' . $data->[0] . "\n";
782 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100783 }
784 );
785
786 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200787 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100788 print "Reading data ...\n";
789
Akron7d4cdd82016-08-17 21:39:45 +0200790 # unless (Cache::FastMmap->new(
791 # share_file => $cache_file,
792 # cache_size => $cache_size,
793 # init_file => $cache_init
794 # )) {
795 # print "Unable to intialize cache '$cache_file'\n\n";
796 # exit(1);
797 # };
Akron11c80302016-03-18 19:44:43 +0100798
Akron941c1a62016-02-23 17:41:41 +0100799 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100800 if (-d $input[0]) {
801 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100802 my @dirs;
803 my $dir;
804
Akron7d4cdd82016-08-17 21:39:45 +0200805 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100806 while (1) {
807 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200808 push @dirs, $dir;
809 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100810 };
811 last unless $it->next;
812 };
813
814 print "Start processing ...\n";
815 $t = Benchmark->new;
816 $count = scalar @dirs;
817
818 DIRECTORY_LOOP:
819 for (my $i = 0; $i < $count; $i++) {
820
Akrone1dbc382016-07-08 22:24:52 +0200821 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200822 $output,
823 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200824 );
Akron941c1a62016-02-23 17:41:41 +0100825
826 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200827 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200828
Akron13d56622016-10-31 14:54:49 +0100829 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
830 $pool->finish(
831 0,
832 ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
833 );
Akron3ec48972016-08-17 23:24:52 +0200834 }
835 else {
Akron4c0cf312016-10-15 16:42:09 +0200836 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200837 };
Akron941c1a62016-02-23 17:41:41 +0100838 };
839 }
840
841 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200842 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200843
Akron941c1a62016-02-23 17:41:41 +0100844 unless ($archive->test_unzip) {
845 print "Unzip is not installed or incompatible.\n\n";
846 exit(1);
847 };
848
Akron08385f62016-03-22 20:37:04 +0100849 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200850 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100851
Akron941c1a62016-02-23 17:41:41 +0100852 print "Start processing ...\n";
853 $t = Benchmark->new;
854 my @dirs = $archive->list_texts;
855 $count = scalar @dirs;
856
857 ARCHIVE_LOOP:
858 for (my $i = 0; $i < $count; $i++) {
859
860 # Split path information
861 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
862
Akrone1dbc382016-07-08 22:24:52 +0200863 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200864 $output,
865 get_file_name(
866 catfile($corpus, $doc, $text)
867 . '.json' . ($gzip ? '.gz' : '')
868 )
Akrone1dbc382016-07-08 22:24:52 +0200869 );
Akron941c1a62016-02-23 17:41:41 +0100870
871 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200872 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100873
Akron4c0cf312016-10-15 16:42:09 +0200874 # Create temporary file
875 $temp = File::Temp->newdir;
876
Akronbdf434a2016-10-24 17:42:07 +0200877 # TODO: Check if $filename exists at the beginning,
878 # because extraction can be horribly slow!
879
Akron941c1a62016-02-23 17:41:41 +0100880 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200881 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100882
Akron7d4cdd82016-08-17 21:39:45 +0200883 # Create corpus directory
884 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100885
Akron7d4cdd82016-08-17 21:39:45 +0200886 # Temporary directory
887 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100888
Akron7d4cdd82016-08-17 21:39:45 +0200889 # Write file
Akron13d56622016-10-31 14:54:49 +0100890 if (my $return = $batch_file->process($dir => $filename)) {
Akron4c0cf312016-10-15 16:42:09 +0200891 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100892 $pool->finish(
893 0,
894 ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
895 );
896 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200897 }
898 else {
Akron4c0cf312016-10-15 16:42:09 +0200899 # Delete temporary file
900 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200901 };
Akron941c1a62016-02-23 17:41:41 +0100902 }
Akron7d4cdd82016-08-17 21:39:45 +0200903
904 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100905 else {
Akron4c0cf312016-10-15 16:42:09 +0200906 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100907 };
908 };
909 }
910
911 else {
912 print "Input is neither a directory nor an archive.\n\n";
913 };
914
915 $pool->wait_all_children;
916
Akron11c80302016-03-18 19:44:43 +0100917 # Delete cache file
918 unlink($cache_file) if $cache_delete;
919
Akron63f20d42017-04-10 23:40:29 +0200920 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100921 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200922};
Akron941c1a62016-02-23 17:41:41 +0100923
Nils Diewald2db9ad02013-10-29 19:26:43 +0000924
Akron63f20d42017-04-10 23:40:29 +0200925# Cleanup temporary extraction directory
# NOTE(review): $extract_dir is assigned earlier in the script (outside
# this section), presumably from the temporary-extract setting - confirm.
# remove_tree() is imported from File::Path. With "safe => 1" it makes no
# attempt to alter file permissions in order to force a deletion, so
# entries the process is not allowed to remove are skipped.
# Its return value (the number of files successfully deleted) is only
# used for the status message below.
926if ($extract_dir) {
927 my $objects = remove_tree($extract_dir, { safe => 1 });
928 print "Removed directory $extract_dir with $objects objects.\n";
929};
930
931
932print "\n";
933
Nils Diewald2db9ad02013-10-29 19:26:43 +0000934__END__
Akron941c1a62016-02-23 17:41:41 +0100935
936=pod
937
938=encoding utf8
939
940=head1 NAME
941
Akronf7ad89e2016-03-16 18:22:47 +0100942korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100943
944
945=head1 SYNOPSIS
946
Akrona76d8352016-10-27 16:27:32 +0200947 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100948
Akron2fd402b2016-10-27 21:26:48 +0200949
Akron941c1a62016-02-23 17:41:41 +0100950=head1 DESCRIPTION
951
952L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
953compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100954The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100955
956
957=head1 INSTALLATION
958
959The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
960
Akronaf386982016-10-12 00:33:25 +0200961 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100962
Akronc13a1702016-03-15 19:33:14 +0100963In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100964be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200965Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +0200966In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100967
968=head1 ARGUMENTS
969
Akrona76d8352016-10-27 16:27:32 +0200970 $ korapxml2krill -z --input <directory> --output <filename>
971
972Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200973It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200974
Akron941c1a62016-02-23 17:41:41 +0100975=over 2
976
977=item B<archive>
978
Akrona76d8352016-10-27 16:27:32 +0200979 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
980
Akron2fd402b2016-10-27 21:26:48 +0200981Converts an archive of KorAP-XML documents. It expects a directory
982(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100983
984=item B<extract>
985
Akrona76d8352016-10-27 16:27:32 +0200986 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
987
988Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100989
Akron63f20d42017-04-10 23:40:29 +0200990=item B<serial>
991
992 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
993
994Convert archives sequentially. The inputs are not merged but treated
995as they are (so they may be premerged or globs).
996The C<--output> directory is treated as the base directory where subdirectories
997are created based on the archive name.
998
999
Akron941c1a62016-02-23 17:41:41 +01001000=back
1001
1002
1003=head1 OPTIONS
1004
1005=over 2
1006
Akrona76d8352016-10-27 16:27:32 +02001007=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001008
Akrona76d8352016-10-27 16:27:32 +02001009Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001010
Akron7606afa2016-10-25 16:23:49 +02001011Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001012document, while C<archive> expects a KorAP-XML corpus folder or a zip
1013file to batch process multiple files.
1014C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001015
Akrona76d8352016-10-27 16:27:32 +02001016C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001017that the first archive listed contains all primary data files
1018and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001019
Akron7606afa2016-10-25 16:23:49 +02001020 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001021
Akron821db3d2017-04-06 21:19:31 +02001022Input may also be defined using BSD glob wildcards.
1023
1024 -i 'file/news*.zip'
1025
1026The extended input array will be sorted in length order, so the shortest
1027path needs to contain all primary data files and all meta data files.
1028
Akron0c3e3752016-06-28 15:55:53 +02001029(The directory structure follows the base directory format,
1030that may include a C<.> root folder.
1031In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001032need to be passed with a hash sign in front of the archive's name.
1033This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001034
Akron7606afa2016-10-25 16:23:49 +02001035To support zip files, a version of C<unzip> needs to be installed that is
1036compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001037
Akron7606afa2016-10-25 16:23:49 +02001038B<The root folder switch using the hash sign is experimental and
1039may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001040
Akron63f20d42017-04-10 23:40:29 +02001041=item B<--input-base|-ib> <directory>
1042
1043The base directory for inputs.
1044
1045
Akron941c1a62016-02-23 17:41:41 +01001046=item B<--output|-o> <directory|file>
1047
1048Output folder for archive processing or
1049document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001050writes to C<STDOUT> by default
1051(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001052
1053=item B<--overwrite|-w>
1054
1055Overwrite files that already exist.
1056
Akron3741f8b2016-12-21 19:55:21 +01001057=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001058
1059Define the default tokenization by specifying
1060the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001061of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001062
Akron3741f8b2016-12-21 19:55:21 +01001063
1064=item B<--base-sentences|-bs> <foundry>#<layer>
1065
1066Define the layer for base sentences.
1067If given, this will be used instead of using C<Base#Sentences>.
1068Currently C<DeReKo#Structure> is the only additional layer supported.
1069
1070 Defaults to unset.
1071
1072
1073=item B<--base-paragraphs|-bp> <foundry>#<layer>
1074
1075Define the layer for base paragraphs.
1076If given, this will be used instead of using C<Base#Paragraphs>.
1077Currently C<DeReKo#Structure> is the only additional layer supported.
1078
1079 Defaults to unset.
1080
1081
Akron41ac10b2017-02-08 22:47:25 +01001082=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1083
1084Define the layer for base pagebreaks.
1085Currently C<DeReKo#Structure> is the only layer supported.
1086
1087 Defaults to unset.
1088
1089
Akron941c1a62016-02-23 17:41:41 +01001090=item B<--skip|-s> <foundry>[#<layer>]
1091
Akronf7ad89e2016-03-16 18:22:47 +01001092Skip specific annotations by specifying the foundry
1093(and optionally the layer with a C<#>-prefix),
1094e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001095Can be set multiple times.
1096
Akronc13a1702016-03-15 19:33:14 +01001097=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001098
Akronf7ad89e2016-03-16 18:22:47 +01001099Convert specific annotations by specifying the foundry
1100(and optionally the layer with a C<#>-prefix),
1101e.g. C<Mate> or C<Mate#Morpho>.
1102Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001103
1104=item B<--primary|-p>
1105
Akronc13a1702016-03-15 19:33:14 +01001106Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001107Can be flagged using C<--no-primary> as well.
1108This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001109
1110=item B<--jobs|-j>
1111
1112Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001113for archive processing.
Akron11c80302016-03-18 19:44:43 +01001114Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001115
1116If C<sequential-extraction> is not set to true, this will
1117also apply to extraction.
1118
Akronc11f7982017-02-21 21:20:14 +01001119Pass -1, and the value will be set automatically to 5
1120times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001121This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001122
Akron9ec88872017-04-12 16:29:06 +02001123=item B<--sequential-extraction|-se>
1124
1125Flag to indicate, if the C<jobs> value also applies to extraction.
1126Some systems may have problems with extracting multiple archives
1127to the same folder at the same time.
1128Can be flagged using C<--no-sequential-extraction> as well.
1129Defaults to C<false>.
1130
Akron35db6e32016-03-17 22:42:22 +01001131=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001132
Akron35db6e32016-03-17 22:42:22 +01001133Define the metadata parser to use. Defaults to C<I5>.
1134Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1135This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001136
1137=item B<--pretty|-y>
1138
Akronc13a1702016-03-15 19:33:14 +01001139Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001140This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001141
1142=item B<--gzip|-z>
1143
Akronf7ad89e2016-03-16 18:22:47 +01001144Compress the output.
1145Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001146
Akron11c80302016-03-18 19:44:43 +01001147=item B<--cache|-c>
1148
1149File to mmap a cache (using L<Cache::FastMmap>).
1150Defaults to C<korapxml2krill.cache> in the calling directory.
1151
1152=item B<--cache-size|-cs>
1153
1154Size of the cache. Defaults to C<50m>.
1155
1156=item B<--cache-init|-ci>
1157
1158Initialize cache file.
1159Can be flagged using C<--no-cache-init> as well.
1160Defaults to C<true>.
1161
1162=item B<--cache-delete|-cd>
1163
1164Delete cache file after processing.
1165Can be flagged using C<--no-cache-delete> as well.
1166Defaults to C<true>.
1167
Akron636aa112017-04-07 18:48:56 +02001168=item B<--config|-cfg>
1169
1170Configure the parameters of your call in a file
1171of key-value pairs with whitespace separator:
1172
1173 overwrite 1
1174 token DeReKo#Structure
1175 ...
1176
1177Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001178C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001179C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001180C<output>,
1181C<temp-extract>, C<sequential-extraction>,
1182C<base-sentences>, C<base-paragraphs>,
1183C<base-pagebreaks>,
1184C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001185(semicolon separated), C<anno> (semicolon separated).
1186
Akron81500102017-04-07 20:45:44 +02001187=item B<--temporary-extract|-te>
1188
1189Only valid for the C<archive> command.
1190
1191This will first extract all files into a
1192directory and then will archive.
1193If the directory is given as C<:temp:>,
1194a temporary directory is used.
1195This is especially useful to avoid
1196massive unzipping and potential
1197network latency.
Akron636aa112017-04-07 18:48:56 +02001198
Akrone10ad322016-02-27 10:54:26 +01001199=item B<--sigle|-sg>
1200
Akron20807582016-10-26 17:11:34 +02001201Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001202Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001203I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001204Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001205In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001206On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001207
Akron941c1a62016-02-23 17:41:41 +01001208=item B<--log|-l>
1209
1210The L<Log::Log4perl> log level. Defaults to C<ERROR>.
1211
1212=item B<--help|-h>
1213
1214Print this document.
1215
1216=item B<--version|-v>
1217
1218Print version information.
1219
1220=back
1221
Akronc13a1702016-03-15 19:33:14 +01001222=head1 ANNOTATION SUPPORT
1223
1224L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1225developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1226The base foundry with paragraphs, sentences, and the text element is mandatory for
1227L<Krill|https://github.com/KorAP/Krill>.
1228
Akron821db3d2017-04-06 21:19:31 +02001229 Base
1230 #Paragraphs
1231 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001232
Akron821db3d2017-04-06 21:19:31 +02001233 Connexor
1234 #Morpho
1235 #Phrase
1236 #Sentences
1237 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001238
Akron821db3d2017-04-06 21:19:31 +02001239 CoreNLP
1240 #Constituency
1241 #Morpho
1242 #NamedEntities
1243 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001244
Akron821db3d2017-04-06 21:19:31 +02001245 DeReKo
1246 #Structure
Akronc13a1702016-03-15 19:33:14 +01001247
Akron821db3d2017-04-06 21:19:31 +02001248 DRuKoLa
1249 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001250
Akron821db3d2017-04-06 21:19:31 +02001251 Glemm
1252 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001253
Akron821db3d2017-04-06 21:19:31 +02001254 Malt
1255 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001256
Akron821db3d2017-04-06 21:19:31 +02001257 MarMoT
1258 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001259
Akron821db3d2017-04-06 21:19:31 +02001260 Mate
1261 #Dependency
1262 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001263
Akron821db3d2017-04-06 21:19:31 +02001264 MDParser
1265 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001266
Akron821db3d2017-04-06 21:19:31 +02001267 OpenNLP
1268 #Morpho
1269 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001270
Akron821db3d2017-04-06 21:19:31 +02001271 Sgbr
1272 #Lemma
1273 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001274
Akron821db3d2017-04-06 21:19:31 +02001275 TreeTagger
1276 #Morpho
1277 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001278
Akron821db3d2017-04-06 21:19:31 +02001279 XIP
1280 #Constituency
1281 #Morpho
1282 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001283
Akronc13a1702016-03-15 19:33:14 +01001284
1285More importers are in preparation.
1286New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1287See the built-in annotation importers as examples.
1288
Akron941c1a62016-02-23 17:41:41 +01001289=head1 AVAILABILITY
1290
1291 https://github.com/KorAP/KorAP-XML-Krill
1292
1293
1294=head1 COPYRIGHT AND LICENSE
1295
Akron3ec0a1c2017-01-18 14:41:55 +01001296Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001297
Akron941c1a62016-02-23 17:41:41 +01001298Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001299
Akrona76d8352016-10-27 16:27:32 +02001300Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001301
1302L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1303Corpus Analysis Platform at the
1304L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1305member of the
1306L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1307
1308This program is free software published under the
1309L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1310
1311=cut