blob: 2ac6ddb34a5f54e23c658550202095fb254f3959 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akronc11f7982017-02-21 21:20:14 +010029
30# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010031# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010032# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010033
Akronc11f7982017-02-21 21:20:14 +010034# TODO: Use KorAP::XML::ForkPool!
35
Akron941c1a62016-02-23 17:41:41 +010036# CHANGES:
37# ----------------------------------------------------------
38# 2013/11/25
39# - Initial release
40#
41# 2014/10/29
42# - Merges foundry data to create indexer friendly documents
43#
Akron93d620e2016-02-05 19:40:05 +010044# 2016/02/04
45# - renamed to korapxml2krill
46# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010047#
48# 2016/02/12
49# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010050# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010051#
52# 2016/02/14
53# - Added version information
Akron941c1a62016-02-23 17:41:41 +010054# - Added support for archive files
55#
56# 2016/02/15
57# - Fixed temporary directory bug
58# - Improved skipping before unzipping
59# - Added EXPERIMENTAL concurrency support
60#
61# 2016/02/23
62# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010063#
64# 2016/02/27
65# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010066#
67# 2016/03/17
68# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010069#
70# 2016/03/18
71# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020072#
Akronf3f0c942016-06-27 13:27:14 +020073# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020074# - Added multi archive support
75# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020076# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020077#
78# 2016/07/06
79# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020080#
81# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020082# - Fixed temporary path issue in script
83#
84# 2016/10/24
85# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020086#
Akronb4bbec72016-10-26 20:21:02 +020087# 2016/10/24
88# - Added support for document extraction
89#
Akron3741f8b2016-12-21 19:55:21 +010090# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020091# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020092#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/12/21
94# - added support for base-sentences and base-tokenizations
95#
Akron4fa37c32017-01-20 14:43:10 +010096# 2017/01/20
97# - added support for DRuKoLa annotations
98#
Akron41ac10b2017-02-08 22:47:25 +010099# 2017/02/08
100# - added support for pagebreak annotations
101#
Akron821db3d2017-04-06 21:19:31 +0200102# 2017/04/06
103# - added support for wildcards in input
104#
Akron636aa112017-04-07 18:48:56 +0200105# 2017/04/07
106# - support configuration option
Akron81500102017-04-07 20:45:44 +0200107# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200108#
Akron63f20d42017-04-10 23:40:29 +0200109# 2017/04/10
110# - support serial processing
111# - support input root
Akron941c1a62016-02-23 17:41:41 +0100112# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100113
Akron636aa112017-04-07 18:48:56 +0200114our $LAST_CHANGE = '2017/04/07';
Akron941c1a62016-02-23 17:41:41 +0100115our $LOCAL = $FindBin::Bin;
116our $VERSION_MSG = <<"VERSION";
117Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
118VERSION
119
Akron63f20d42017-04-10 23:40:29 +0200120# Prototypes
121sub get_file_name_from_glob($);
122sub get_file_name($);
123
Akron941c1a62016-02-23 17:41:41 +0100124# Parse command
125my $cmd;
126our @ARGV;
127if ($ARGV[0] && index($ARGV[0], '-') != 0) {
128 $cmd = shift @ARGV;
Akron150b29e2016-02-14 23:06:48 +0100129};
Akron63f20d42017-04-10 23:40:29 +0200130my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100131
Akron5f51d422016-08-16 16:26:43 +0200132my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100133my $text;
Akrone10ad322016-02-27 10:54:26 +0100134
Akron941c1a62016-02-23 17:41:41 +0100135# Parse options from the command line
Nils Diewald7364d1f2013-11-05 19:26:35 +0000136GetOptions(
Akron08385f62016-03-22 20:37:04 +0100137 'input|i=s' => \@input,
Akron63f20d42017-04-10 23:40:29 +0200138 'input-base|ib=s' => \(my $input_base),
Akron941c1a62016-02-23 17:41:41 +0100139 'output|o=s' => \(my $output),
140 'overwrite|w' => \(my $overwrite),
Akron35db6e32016-03-17 22:42:22 +0100141 'meta|m=s' => \(my $meta),
Akron636aa112017-04-07 18:48:56 +0200142 'token|t=s' => \(my $token_base),
143 'base-sentences|bs=s' => \(my $base_sentences),
144 'base-paragraphs|bp=s' => \(my $base_paragraphs),
145 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
Akron941c1a62016-02-23 17:41:41 +0100146 'gzip|z' => \(my $gzip),
Akron81500102017-04-07 20:45:44 +0200147 'temporary-extract|te=s' => \(my $extract_dir),
Akrone10ad322016-02-27 10:54:26 +0100148 'skip|s=s' => \@skip,
149 'sigle|sg=s' => \@sigle,
Akron636aa112017-04-07 18:48:56 +0200150 'cache|c=s' => \(my $cache_file),
151 'config|cfg=s' => \(my $cfg_file),
152 'log|l=s' => \(my $log_level),
Akron5f51d422016-08-16 16:26:43 +0200153 'anno|a=s' => \@anno,
Akron941c1a62016-02-23 17:41:41 +0100154 'primary|p!' => \(my $primary),
155 'pretty|y' => \(my $pretty),
Akron636aa112017-04-07 18:48:56 +0200156 'jobs|j=i' => \(my $jobs),
157 'cache-size|cs=s' => \(my $cache_size),
158 'cache-delete|cd!' => \(my $cache_delete),
159 'cache-init|ci!' => \(my $cache_init),
Akron941c1a62016-02-23 17:41:41 +0100160 'help|h' => sub {
161 pod2usage(
162 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200163 -verbose => 99,
164 -msg => $VERSION_MSG,
165 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100166 );
167 },
168 'version|v' => sub {
169 pod2usage(
Akron7d4cdd82016-08-17 21:39:45 +0200170 -verbose => 0,
171 -msg => $VERSION_MSG,
172 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100173 )
174 }
Nils Diewald7364d1f2013-11-05 19:26:35 +0000175);
176
Akron63f20d42017-04-10 23:40:29 +0200177
Akron636aa112017-04-07 18:48:56 +0200178# Load from configuration
# Merge settings from the configuration file.
# Values given on the command line always take precedence; a value from
# the file is only used when the corresponding option is still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: [reference to the option variable, configuration key]
  my @scalar_options = (
    [\$overwrite,       'overwrite'],
    [\$gzip,            'gzip'],
    [\$jobs,            'jobs'],
    [\$input_base,      'input-base'],
    [\$extract_dir,     'temporary-extract'],
    [\$token_base,      'token'],
    [\$cache_file,      'cache'],
    [\$cache_size,      'cache-size'],
    [\$cache_delete,    'cache-delete'],
    [\$cache_init,      'cache-init'],
    [\$meta,            'meta'],
    [\$output,          'output'],
    [\$base_sentences,  'base-sentences'],
    [\$base_paragraphs, 'base-paragraphs'],
    [\$base_pagebreaks, 'base-pagebreaks'],
    [\$log_level,       'log']
  );

  foreach my $option (@scalar_options) {
    my ($target, $key) = @$option;
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # List options are written as semicolon separated values in the file
  my @list_options = (
    [\@skip,  'skip'],
    [\@sigle, 'sigle'],
    [\@anno,  'anno']
  );

  foreach my $option (@list_options) {
    my ($target, $key) = @$option;
    next if scalar @$target;
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
279
Akron63f20d42017-04-10 23:40:29 +0200280
Akron636aa112017-04-07 18:48:56 +0200281# Set default token base
282$token_base //= 'OpenNLP#tokens';
283$cache_file //= 'korapxml2krill.cache';
284$cache_size //= '50m';
285$jobs //= 0;
286$cache_delete //= 1;
287$cache_init //= 1;
288$log_level //= 'ERROR';
289$base_sentences //= '';
290$base_paragraphs //= '';
291$base_pagebreaks //= '';
292
Akron821db3d2017-04-06 21:19:31 +0200293$base_sentences = lc $base_sentences;
Akron3741f8b2016-12-21 19:55:21 +0100294$base_paragraphs = lc $base_paragraphs;
Akron636bd9c2017-02-09 17:13:00 +0100295$base_pagebreaks = lc $base_pagebreaks;
Akron3741f8b2016-12-21 19:55:21 +0100296
Akron63f20d42017-04-10 23:40:29 +0200297
298# Initialize log4perl object
299Log::Log4perl->init({
300 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
301 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
302 'log4perl.appender.STDERR.layout' => 'PatternLayout',
303 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
304});
305
306my $log = Log::Log4perl->get_logger('main');
307
308
309print "Reading config from $cfg_file\n" if $cfg_file;
310
311
Akron941c1a62016-02-23 17:41:41 +0100312my %ERROR_HASH = (
313 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200314 -verbose => 99,
315 -msg => $VERSION_MSG,
316 -output => '-',
317 -exit => 1
Akron941c1a62016-02-23 17:41:41 +0100318);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000319
Akron941c1a62016-02-23 17:41:41 +0100320# Input has to be defined
Akron08385f62016-03-22 20:37:04 +0100321pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000322
Akrone1dbc382016-07-08 22:24:52 +0200323# Gzip has no effect, if no output is given
324pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000325
Akronc11f7982017-02-21 21:20:14 +0100326
Akron636aa112017-04-07 18:48:56 +0200327if ($jobs eq '-1') {
Akronc11f7982017-02-21 21:20:14 +0100328 state $cores = Sys::Info->new->device('CPU')->count;
329 $jobs = ceil(5 * $cores);
Akron636aa112017-04-07 18:48:56 +0200330 $log->info("Run using $jobs jobs on $cores cores");
Akronc11f7982017-02-21 21:20:14 +0100331};
332
Akron821db3d2017-04-06 21:19:31 +0200333
Akron63f20d42017-04-10 23:40:29 +0200334# Start serial processing
# Start serial processing: every input is converted by a separate
# "archive" run of this script into its own subdirectory of the output
# directory. The script exits with a non-zero status if any run fails.
if (defined $cmd && $cmd eq 'serial') {

  # All per-input directories are created below the output directory,
  # so it has to be given and it has to exist
  unless (defined $output) {
    $log->error('Serial processing requires an output directory');
    exit(1);
  };

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Remove all input and output parameters from the original command
  # line, as they are set separately for each archive run
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Flag with an attached value, e.g. --input=archive.zip
    if (/^--?(?:input|i|output|o)=/) {
      $keep = 0;
    }

    # Flag followed by its value in the next argument
    elsif ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    $keep;
  } @keep_argv;

  my $failed = 0;

  # Iterate over all inputs
  foreach my $input_glob (@input) {

    my $new_out = catdir($output, get_file_name_from_glob($input_glob));

    # Create new path. make_path only reports the directories it newly
    # created, so an existing directory must not be treated as a failure;
    # errors are collected instead of thrown and the result is verified
    make_path($new_out, { error => \my $mkdir_errors });
    unless (-d $new_out) {
      $log->error("Can\'t create path $new_out");
      exit(1);
    };

    # Create archive command (list form, no shell involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $input_glob, '-o', $new_out
    );
    print "Start serial processing of $input_glob to $new_out\n";

    # Start archiving and remember failures, but continue with the
    # remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error("Processing of $input_glob failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
385
Akrone1dbc382016-07-08 22:24:52 +0200386my %skip;
387$skip{lc($_)} = 1 foreach @skip;
388
389my @layers;
Akron3741f8b2016-12-21 19:55:21 +0100390push(@layers, ['Base', 'Sentences']) unless $base_sentences;
391push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;
Akrone1dbc382016-07-08 22:24:52 +0200392
393# Connexor
394push(@layers, ['Connexor', 'Morpho']);
395push(@layers, ['Connexor', 'Syntax']);
396push(@layers, ['Connexor', 'Phrase']);
397push(@layers, ['Connexor', 'Sentences']);
398
399# CoreNLP
400push(@layers, ['CoreNLP', 'NamedEntities']);
401push(@layers, ['CoreNLP', 'Sentences']);
402push(@layers, ['CoreNLP', 'Morpho']);
403push(@layers, ['CoreNLP', 'Constituency']);
404
Akron3741f8b2016-12-21 19:55:21 +0100405
Akrone1dbc382016-07-08 22:24:52 +0200406# DeReKo
Akron41ac10b2017-02-08 22:47:25 +0100407my @dereko_attr = ();
408if ($base_sentences eq 'dereko#structure') {
409 push @dereko_attr, 'sentences';
410};
411if ($base_paragraphs eq 'dereko#structure') {
412 push @dereko_attr, 'paragraphs';
413};
Akron636bd9c2017-02-09 17:13:00 +0100414
Akron41ac10b2017-02-08 22:47:25 +0100415if ($base_pagebreaks eq 'dereko#structure') {
416 push @dereko_attr, 'pagebreaks';
417};
418
419if ($dereko_attr[0]) {
420 push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
Akron3741f8b2016-12-21 19:55:21 +0100421}
422else {
423 push(@layers, ['DeReKo', 'Structure']);
424};
Akrone1dbc382016-07-08 22:24:52 +0200425
426# Glemm
427push(@layers, ['Glemm', 'Morpho']);
428
429# Malt
430push(@layers, ['Malt', 'Dependency']);
431
432# MDParser
433push(@layers, ['MDParser', 'Dependency']);
434
435# Mate
436push(@layers, ['Mate', 'Morpho']);
437push(@layers, ['Mate', 'Dependency']);
438
439# OpenNLP
440push(@layers, ['OpenNLP', 'Morpho']);
441push(@layers, ['OpenNLP', 'Sentences']);
442
443# Schreibgebrauch
444push(@layers, ['Sgbr', 'Lemma']);
445push(@layers, ['Sgbr', 'Morpho']);
446
447# TreeTagger
448push(@layers, ['TreeTagger', 'Morpho']);
449push(@layers, ['TreeTagger', 'Sentences']);
450
451# XIP
452push(@layers, ['XIP', 'Morpho']);
453push(@layers, ['XIP', 'Constituency']);
454push(@layers, ['XIP', 'Sentences']);
455push(@layers, ['XIP', 'Dependency']);
456
Akron4fa37c32017-01-20 14:43:10 +0100457# DRuKoLa
458push(@layers, ['DRuKoLa', 'Morpho']);
459
Akron3bd942f2017-02-20 20:09:14 +0100460# Marmot
461push(@layers, ['MarMoT', 'Morpho']);
462
Akron4fa37c32017-01-20 14:43:10 +0100463
Akrone1dbc382016-07-08 22:24:52 +0200464# Check filters
465my @filtered_anno;
466if ($skip{'#all'}) {
467 foreach (@anno) {
468 push @filtered_anno, [ split('#', $_) ];
469 };
470}
471
472# Add all annotations that are not skipped
473else {
474 # Add to index file - respect skipping
475 foreach my $info (@layers) {
476 # Skip if Foundry or Foundry#Layer should be skipped
477 unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
478 push @filtered_anno, $info;
479 };
480 };
481};
482
# Get tokenization basis in the form "Foundry#Layer".
# The declaration is unconditional: combining "my" with a statement
# modifier has undefined behaviour, and $token_base always holds a value
# at this point because a default is assigned to it beforehand.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
485
486# TODO: This should not be initialized for batch
487my $cache = Cache::FastMmap->new(
488 share_file => $cache_file,
489 cache_size => $cache_size,
Akron03b24db2016-08-16 20:54:32 +0200490 init_file => $cache_init
Akrone1dbc382016-07-08 22:24:52 +0200491);
492
Akron03b24db2016-08-16 20:54:32 +0200493# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200494my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200495 cache => $cache,
Akrone1dbc382016-07-08 22:24:52 +0200496 meta_type => $meta,
497 overwrite => $overwrite,
Akron03b24db2016-08-16 20:54:32 +0200498 foundry => $token_base_foundry,
499 layer => $token_base_layer,
500 gzip => $gzip,
501 log => $log,
502 primary => $primary,
503 pretty => $pretty,
504 anno => \@filtered_anno
Akrone1dbc382016-07-08 22:24:52 +0200505);
506
Akron941c1a62016-02-23 17:41:41 +0100507# Get file name based on path information
# Get file name based on path information.
#
# The path of a text is flattened to a name usable as an output file name:
# temporary directory fragments and the location of the first input are
# removed and the remaining path separators are replaced by hyphens.
#
# Parameters:
#   $file - path of the text (directory or archive entry)
# Returns:
#   the flattened name
sub get_file_name ($) {
  my $file = shift;

  # Location of the first input; for a directory only its parent is
  # removed from the path, so the directory name itself is kept
  my $input_prefix = $input[0];
  if (-d $input_prefix) {
    $input_prefix =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input location. The path is quoted, as it is plain text
  # and may contain characters with a special meaning in a pattern
  $file =~ s{/?\Q$input_prefix\E}{};

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the first segment, if more than three segments remain
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
523
Akron63f20d42017-04-10 23:40:29 +0200524
# Derive a directory name from an input file name or wildcard pattern.
#
# The ".zip" extension and the leading path are removed, wildcard
# characters are dropped and brackets are turned into hyphens.
#
# Parameters:
#   $glob - file name or wildcard pattern, e.g. "/corpora/wpd15*.zip"
# Returns:
#   the cleaned name, e.g. "wpd15"
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s/\.zip$//;              # Remove file extension
  $glob =~ s{^.*\/([^\/]+)$}{$1};   # Remove path unix style
  $glob =~ s{^.*\\([^\\]+)$}{$1};   # Remove path windows style
  $glob =~ s/[\*\?]//g;             # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;        # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;             # Remove sequences of binding characters
  $glob =~ s/^-//;                  # Clean beginning
  $glob =~ s/-$//;                  # Clean end
  return $glob;
};
537
538
Akrone10ad322016-02-27 10:54:26 +0100539# Convert sigle to path construct
540s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
541
# All commands write to a directory: if an output is given, it has to be
# an existing directory. A missing directory is an error and therefore
# reported with a non-zero exit status.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
548
Akron63f20d42017-04-10 23:40:29 +0200549
550# Glob and prefix files
# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@new_input, bsd_glob($pattern));
  };

  # None of the patterns matched any file: stop here, as all further
  # processing relies on at least one input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit(1);
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
569
570
Akron941c1a62016-02-23 17:41:41 +0100571# Process a single file
# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # Remember the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # The document path requires a trailing slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  # NOTE(review): the return value is treated as true on success, like
  # the archive loops in this file do - confirm for output to STDOUT
  my $processed = $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();

  # Report success with status 0 and failure with a non-zero status
  exit($processed ? 0 : 1);
};
602
Nils Diewald59094f22014-11-05 18:20:50 +0000603
Akrone10ad322016-02-27 10:54:26 +0100604# Extract XML files
Akron81500102017-04-07 20:45:44 +0200605if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100606
Akron7d4cdd82016-08-17 21:39:45 +0200607 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200608 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100609
Akron7d4cdd82016-08-17 21:39:45 +0200610 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100611 unless ($archive->test_unzip) {
612 print "Unzip is not installed or incompatible.\n\n";
Akron81500102017-04-07 20:45:44 +0200613 exit(0);
Akrone10ad322016-02-27 10:54:26 +0100614 };
615
Akronb0c88db2016-06-29 16:33:18 +0200616 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200617 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200618
Akron651cb8d2016-08-16 21:44:49 +0200619 my $prefix = 1;
620
Akron03b24db2016-08-16 20:54:32 +0200621 # No sigles given
622 unless (@sigle) {
623
624 # Get files
625 foreach ($archive->list_texts) {
626
627 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200628 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200629
630 # TODO: Make this OS independent
631 push @sigle, join '/', $corpus, $doc, $text;
632 };
Akron20807582016-10-26 17:11:34 +0200633 }
634
635 # Check sigle for doc sigles
636 else {
637 my @new_sigle;
638
639 my $prefix_check = 0;
640
641 # Iterate over all sigle
642 foreach (@sigle) {
643
644 # Sigle is a doc sigle
645 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200646
Akron60a8caa2017-02-17 21:51:27 +0100647 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200648 # Check if a prefix is needed
649 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100650
651 if ($prefix = $archive->check_prefix) {
652 print " with prefix ...";
653 };
Akron20807582016-10-26 17:11:34 +0200654 $prefix_check = 1;
655 };
656
Akron60a8caa2017-02-17 21:51:27 +0100657 print "\n";
658
Akron20807582016-10-26 17:11:34 +0200659 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200660 my $path = ($prefix ? './' : '') . $_;
661
662 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200663 $archive->extract_doc(
Akron2812ba22016-10-28 21:55:59 +0200664 $path, $output, $jobs
Akron20807582016-10-26 17:11:34 +0200665 ) ? '' : 'not '
666 );
667 print "extracted.\n";
668 }
Akron60a8caa2017-02-17 21:51:27 +0100669
670 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200671 else {
672 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100673
674 unless ($prefix_check) {
675
676 if ($prefix = $archive->check_prefix) {
677 print " with prefix ...";
678 };
679 $prefix_check = 1;
680 };
Akron20807582016-10-26 17:11:34 +0200681 };
682 };
683 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200684 };
685
Akrone10ad322016-02-27 10:54:26 +0100686 # Iterate over all given sigles and extract
687 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100688
Akron2812ba22016-10-28 21:55:59 +0200689 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200690
Akron03b24db2016-08-16 20:54:32 +0200691 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200692 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100693
Akron20807582016-10-26 17:11:34 +0200694 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200695 ($prefix ? './' : '') . $_, $output
696 ) ? '' : 'not '
697 );
Akrone10ad322016-02-27 10:54:26 +0100698 print "extracted.\n";
699 };
Akronb0c88db2016-06-29 16:33:18 +0200700 }
Akron7d4cdd82016-08-17 21:39:45 +0200701
702 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200703 else {
704 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron81500102017-04-07 20:45:44 +0200705 exit(1);
Akrone10ad322016-02-27 10:54:26 +0100706 };
707}
708
Akron81500102017-04-07 20:45:44 +0200709
Akron941c1a62016-02-23 17:41:41 +0100710# Process an archive
711elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000712
Akron81500102017-04-07 20:45:44 +0200713 my $archive_output;
714
715 # First extract, then archive
716 if (defined $extract_dir) {
717
718 # Create new archive object
719 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
720
721 # Check zip capabilities
722 unless ($archive->test_unzip) {
723 print "Unzip is not installed or incompatible.\n\n";
724 exit(0);
725 };
726
727 # Add further annotation archived
728 $archive->attach($_) foreach @input[1..$#input];
729
730 # Create a temporary directory
731 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200732 $extract_dir = tempdir(CLEANUP => 0);
733 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200734 };
735
Akron63f20d42017-04-10 23:40:29 +0200736 # Add some random extra to avoid clashes with multiple archives
737 $extract_dir = catdir($extract_dir, random_string('cccccc'));
738
739 # Extract to temporary directory
Akron81500102017-04-07 20:45:44 +0200740 if ($archive->extract_all($extract_dir, $jobs)) {
741 @input = ($extract_dir);
742 }
743 else {
744 $log->error('Unable to extract from primary archive ' . $input[0] .
745 ' to ' . $extract_dir);
746 exit(1);
747 };
748 }
749
750 # Can't create archive object
751 else {
752 $log->error('Unable to extract from primary archive ' . $input[0]);
753 exit(1);
754 };
755 };
756
Akrone1dbc382016-07-08 22:24:52 +0200757 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100758
Akron7d4cdd82016-08-17 21:39:45 +0200759 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100760 my $pool = Parallel::ForkManager->new($jobs);
761
Akron7d4cdd82016-08-17 21:39:45 +0200762 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100763 my $iter = 1; # Current text in process
764
765 # Report on fork message
766 $pool->run_on_finish (
767 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200768 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100769 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200770
Akron08385f62016-03-22 20:37:04 +0100771 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200772 ($iter++) . "/$count]" .
773 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200774 ' ' . $data->[0] . "\n";
775 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100776 }
777 );
778
779 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200780 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100781 print "Reading data ...\n";
782
Akron7d4cdd82016-08-17 21:39:45 +0200783 # unless (Cache::FastMmap->new(
784 # share_file => $cache_file,
785 # cache_size => $cache_size,
786 # init_file => $cache_init
787 # )) {
788 # print "Unable to intialize cache '$cache_file'\n\n";
789 # exit(1);
790 # };
Akron11c80302016-03-18 19:44:43 +0100791
Akron941c1a62016-02-23 17:41:41 +0100792 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100793 if (-d $input[0]) {
794 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100795 my @dirs;
796 my $dir;
797
Akron7d4cdd82016-08-17 21:39:45 +0200798 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100799 while (1) {
800 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200801 push @dirs, $dir;
802 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100803 };
804 last unless $it->next;
805 };
806
807 print "Start processing ...\n";
808 $t = Benchmark->new;
809 $count = scalar @dirs;
810
811 DIRECTORY_LOOP:
812 for (my $i = 0; $i < $count; $i++) {
813
Akrone1dbc382016-07-08 22:24:52 +0200814 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200815 $output,
816 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200817 );
Akron941c1a62016-02-23 17:41:41 +0100818
819 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200820 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200821
Akron13d56622016-10-31 14:54:49 +0100822 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
823 $pool->finish(
824 0,
825 ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
826 );
Akron3ec48972016-08-17 23:24:52 +0200827 }
828 else {
Akron4c0cf312016-10-15 16:42:09 +0200829 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200830 };
Akron941c1a62016-02-23 17:41:41 +0100831 };
832 }
833
834 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200835 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200836
Akron941c1a62016-02-23 17:41:41 +0100837 unless ($archive->test_unzip) {
838 print "Unzip is not installed or incompatible.\n\n";
839 exit(1);
840 };
841
Akron08385f62016-03-22 20:37:04 +0100842 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200843 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100844
Akron941c1a62016-02-23 17:41:41 +0100845 print "Start processing ...\n";
846 $t = Benchmark->new;
847 my @dirs = $archive->list_texts;
848 $count = scalar @dirs;
849
850 ARCHIVE_LOOP:
851 for (my $i = 0; $i < $count; $i++) {
852
853 # Split path information
854 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
855
Akrone1dbc382016-07-08 22:24:52 +0200856 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200857 $output,
858 get_file_name(
859 catfile($corpus, $doc, $text)
860 . '.json' . ($gzip ? '.gz' : '')
861 )
Akrone1dbc382016-07-08 22:24:52 +0200862 );
Akron941c1a62016-02-23 17:41:41 +0100863
864 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200865 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100866
Akron4c0cf312016-10-15 16:42:09 +0200867 # Create temporary file
868 $temp = File::Temp->newdir;
869
Akronbdf434a2016-10-24 17:42:07 +0200870 # TODO: Check if $filename exist at the beginning,
871 # because extraction can be horrible slow!
872
Akron941c1a62016-02-23 17:41:41 +0100873 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200874 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100875
Akron7d4cdd82016-08-17 21:39:45 +0200876 # Create corpus directory
877 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100878
Akron7d4cdd82016-08-17 21:39:45 +0200879 # Temporary directory
880 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100881
Akron7d4cdd82016-08-17 21:39:45 +0200882 # Write file
Akron13d56622016-10-31 14:54:49 +0100883 if (my $return = $batch_file->process($dir => $filename)) {
Akron4c0cf312016-10-15 16:42:09 +0200884 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100885 $pool->finish(
886 0,
887 ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
888 );
889 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200890 }
891 else {
Akron4c0cf312016-10-15 16:42:09 +0200892 # Delete temporary file
893 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200894 };
Akron941c1a62016-02-23 17:41:41 +0100895 }
Akron7d4cdd82016-08-17 21:39:45 +0200896
897 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100898 else {
Akron4c0cf312016-10-15 16:42:09 +0200899 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100900 };
901 };
902 }
903
904 else {
905 print "Input is neither a directory nor an archive.\n\n";
906 };
907
908 $pool->wait_all_children;
909
Akron11c80302016-03-18 19:44:43 +0100910 # Delete cache file
911 unlink($cache_file) if $cache_delete;
912
Akron63f20d42017-04-10 23:40:29 +0200913 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100914 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200915};
Akron941c1a62016-02-23 17:41:41 +0100916
Nils Diewald2db9ad02013-10-29 19:26:43 +0000917
# Cleanup temporary extraction directory.
# NOTE(review): $extract_dir is assigned outside this section - presumably
# only when texts were extracted to disk before conversion
# (see --temporary-extract); confirm against the option handling.
if ($extract_dir) {

  # Collect failures in $errors instead of letting remove_tree carp on
  # its own, so that success is only reported if the tree is really gone.
  # With 'safe' set, remove_tree makes no attempt to alter permissions
  # and silently skips entries it is not allowed to delete.
  my $objects = remove_tree(
    $extract_dir,
    { safe => 1, error => \my $errors }
  );

  # Each diagnostic is a single-pair hash (file => message);
  # the file is empty for general errors
  foreach my $diag (@{ $errors // [] }) {
    my ($file, $message) = %$diag;
    warn(
      $file eq ''
        ? "Unable to remove directory $extract_dir: $message\n"
        : "Unable to remove $file: $message\n"
    );
  };

  # Skipped entries are not reported as errors, so check the
  # directory itself as well
  if (-e $extract_dir) {
    warn "Directory $extract_dir could not be removed completely " .
      "($objects objects removed).\n";
  }
  else {
    print "Removed directory $extract_dir with $objects objects.\n";
  };
};


print "\n";
926
Nils Diewald2db9ad02013-10-29 19:26:43 +0000927__END__
Akron941c1a62016-02-23 17:41:41 +0100928
929=pod
930
931=encoding utf8
932
933=head1 NAME
934
korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100936
937
938=head1 SYNOPSIS
939
  korapxml2krill [archive|extract|serial] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100941
Akron2fd402b2016-10-27 21:26:48 +0200942
Akron941c1a62016-02-23 17:41:41 +0100943=head1 DESCRIPTION
944
945L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
946compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100947The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100948
949
950=head1 INSTALLATION
951
952The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
953
Akronaf386982016-10-12 00:33:25 +0200954 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100955
In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.
The minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, the C<unzip> tool needs to be present to work with zip archives.
Akron941c1a62016-02-23 17:41:41 +0100960
961=head1 ARGUMENTS
962
Akrona76d8352016-10-27 16:27:32 +0200963 $ korapxml2krill -z --input <directory> --output <filename>
964
965Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200966It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200967
Akron941c1a62016-02-23 17:41:41 +0100968=over 2
969
970=item B<archive>
971
Akrona76d8352016-10-27 16:27:32 +0200972 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
973
Akron2fd402b2016-10-27 21:26:48 +0200974Converts an archive of KorAP-XML documents. It expects a directory
975(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100976
977=item B<extract>
978
Akrona76d8352016-10-27 16:27:32 +0200979 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
980
981Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100982
Akron63f20d42017-04-10 23:40:29 +0200983=item B<serial>
984
985 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
986
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name.
991
992
Akron941c1a62016-02-23 17:41:41 +0100993=back
994
995
996=head1 OPTIONS
997
998=over 2
999
Akrona76d8352016-10-27 16:27:32 +02001000=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001001
Akrona76d8352016-10-27 16:27:32 +02001002Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001003
Akron7606afa2016-10-25 16:23:49 +02001004Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001005document, while C<archive> expects a KorAP-XML corpus folder or a zip
1006file to batch process multiple files.
1007C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001008
Akrona76d8352016-10-27 16:27:32 +02001009C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001010that the first archive listed contains all primary data files
1011and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001012
Akron7606afa2016-10-25 16:23:49 +02001013 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001014
Akron821db3d2017-04-06 21:19:31 +02001015Input may also be defined using BSD glob wildcards.
1016
1017 -i 'file/news*.zip'
1018
1019The extended input array will be sorted in length order, so the shortest
1020path needs to contain all primary data files and all meta data files.
1021
(The directory structure follows the base directory format,
which may include a C<.> root folder.
In this case, further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001027
Akron7606afa2016-10-25 16:23:49 +02001028To support zip files, a version of C<unzip> needs to be installed that is
1029compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001030
Akron7606afa2016-10-25 16:23:49 +02001031B<The root folder switch using the hash sign is experimental and
1032may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001033
Akron63f20d42017-04-10 23:40:29 +02001034=item B<--input-base|-ib> <directory>
1035
1036The base directory for inputs.
1037
1038
Akron941c1a62016-02-23 17:41:41 +01001039=item B<--output|-o> <directory|file>
1040
1041Output folder for archive processing or
1042document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001043writes to C<STDOUT> by default
1044(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001045
1046=item B<--overwrite|-w>
1047
1048Overwrite files that already exist.
1049
Akron3741f8b2016-12-21 19:55:21 +01001050=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001051
1052Define the default tokenization by specifying
1053the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001054of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001055
Akron3741f8b2016-12-21 19:55:21 +01001056
1057=item B<--base-sentences|-bs> <foundry>#<layer>
1058
1059Define the layer for base sentences.
1060If given, this will be used instead of using C<Base#Sentences>.
1061Currently C<DeReKo#Structure> is the only additional layer supported.
1062
1063 Defaults to unset.
1064
1065
1066=item B<--base-paragraphs|-bp> <foundry>#<layer>
1067
1068Define the layer for base paragraphs.
1069If given, this will be used instead of using C<Base#Paragraphs>.
1070Currently C<DeReKo#Structure> is the only additional layer supported.
1071
1072 Defaults to unset.
1073
1074
Akron41ac10b2017-02-08 22:47:25 +01001075=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1076
1077Define the layer for base pagebreaks.
1078Currently C<DeReKo#Structure> is the only layer supported.
1079
1080 Defaults to unset.
1081
1082
Akron941c1a62016-02-23 17:41:41 +01001083=item B<--skip|-s> <foundry>[#<layer>]
1084
Akronf7ad89e2016-03-16 18:22:47 +01001085Skip specific annotations by specifying the foundry
1086(and optionally the layer with a C<#>-prefix),
1087e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001088Can be set multiple times.
1089
Akronc13a1702016-03-15 19:33:14 +01001090=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001091
Akronf7ad89e2016-03-16 18:22:47 +01001092Convert specific annotations by specifying the foundry
1093(and optionally the layer with a C<#>-prefix),
1094e.g. C<Mate> or C<Mate#Morpho>.
1095Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001096
1097=item B<--primary|-p>
1098
Akronc13a1702016-03-15 19:33:14 +01001099Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001100Can be flagged using C<--no-primary> as well.
1101This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001102
1103=item B<--jobs|-j>
1104
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
Pass C<-1>, and the value will be set automatically to 5
times the number of available cores.
This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001111
Akron35db6e32016-03-17 22:42:22 +01001112=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001113
Akron35db6e32016-03-17 22:42:22 +01001114Define the metadata parser to use. Defaults to C<I5>.
1115Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1116This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001117
1118=item B<--pretty|-y>
1119
Akronc13a1702016-03-15 19:33:14 +01001120Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001121This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001122
1123=item B<--gzip|-z>
1124
Akronf7ad89e2016-03-16 18:22:47 +01001125Compress the output.
1126Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001127
Akron11c80302016-03-18 19:44:43 +01001128=item B<--cache|-c>
1129
1130File to mmap a cache (using L<Cache::FastMmap>).
1131Defaults to C<korapxml2krill.cache> in the calling directory.
1132
1133=item B<--cache-size|-cs>
1134
1135Size of the cache. Defaults to C<50m>.
1136
1137=item B<--cache-init|-ci>
1138
1139Initialize cache file.
1140Can be flagged using C<--no-cache-init> as well.
1141Defaults to C<true>.
1142
1143=item B<--cache-delete|-cd>
1144
1145Delete cache file after processing.
1146Can be flagged using C<--no-cache-delete> as well.
1147Defaults to C<true>.
1148
Akron636aa112017-04-07 18:48:56 +02001149=item B<--config|-cfg>
1150
1151Configure the parameters of your call in a file
1152of key-value pairs with whitespace separator
1153
1154 overwrite 1
1155 token DeReKo#Structure
1156 ...
1157
1158Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001159C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001160C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron81500102017-04-07 20:45:44 +02001161C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
Akron636aa112017-04-07 18:48:56 +02001162C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
1163(semicolon separated), C<anno> (semicolon separated).
1164
Akron81500102017-04-07 20:45:44 +02001165=item B<--temporary-extract|-te>
1166
1167Only valid for the C<archive> command.
1168
This will first extract all files into a
directory and then run the C<archive>
conversion on the extracted files.
If the directory is given as C<:temp:>,
a temporary directory is used.
This is especially useful to avoid
massive unzipping and potential
network latency.
Akron636aa112017-04-07 18:48:56 +02001176
Akrone10ad322016-02-27 10:54:26 +01001177=item B<--sigle|-sg>
1178
Akron20807582016-10-26 17:11:34 +02001179Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001180Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001181I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001182Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001183In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001184On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001185
Akron941c1a62016-02-23 17:41:41 +01001186=item B<--log|-l>
1187
The L<Log::Log4perl> log level. Defaults to C<ERROR>.
1189
1190=item B<--help|-h>
1191
1192Print this document.
1193
1194=item B<--version|-v>
1195
1196Print version information.
1197
1198=back
1199
Akronc13a1702016-03-15 19:33:14 +01001200=head1 ANNOTATION SUPPORT
1201
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1206
Akron821db3d2017-04-06 21:19:31 +02001207 Base
1208 #Paragraphs
1209 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001210
Akron821db3d2017-04-06 21:19:31 +02001211 Connexor
1212 #Morpho
1213 #Phrase
1214 #Sentences
1215 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001216
Akron821db3d2017-04-06 21:19:31 +02001217 CoreNLP
1218 #Constituency
1219 #Morpho
1220 #NamedEntities
1221 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001222
Akron821db3d2017-04-06 21:19:31 +02001223 DeReKo
1224 #Structure
Akronc13a1702016-03-15 19:33:14 +01001225
Akron821db3d2017-04-06 21:19:31 +02001226 DRuKoLa
1227 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001228
Akron821db3d2017-04-06 21:19:31 +02001229 Glemm
1230 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001231
Akron821db3d2017-04-06 21:19:31 +02001232 Malt
1233 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001234
Akron821db3d2017-04-06 21:19:31 +02001235 MarMoT
1236 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001237
Akron821db3d2017-04-06 21:19:31 +02001238 Mate
1239 #Dependency
1240 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001241
Akron821db3d2017-04-06 21:19:31 +02001242 MDParser
1243 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001244
Akron821db3d2017-04-06 21:19:31 +02001245 OpenNLP
1246 #Morpho
1247 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001248
Akron821db3d2017-04-06 21:19:31 +02001249 Sgbr
1250 #Lemma
1251 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001252
Akron821db3d2017-04-06 21:19:31 +02001253 TreeTagger
1254 #Morpho
1255 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001256
Akron821db3d2017-04-06 21:19:31 +02001257 XIP
1258 #Constituency
1259 #Morpho
1260 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001261
Akronc13a1702016-03-15 19:33:14 +01001262
1263More importers are in preparation.
1264New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1265See the built-in annotation importers as examples.
1266
Akron941c1a62016-02-23 17:41:41 +01001267=head1 AVAILABILITY
1268
1269 https://github.com/KorAP/KorAP-XML-Krill
1270
1271
1272=head1 COPYRIGHT AND LICENSE
1273
Akron3ec0a1c2017-01-18 14:41:55 +01001274Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001275
Akron941c1a62016-02-23 17:41:41 +01001276Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001277
Akrona76d8352016-10-27 16:27:32 +02001278Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001279
1280L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1281Corpus Analysis Platform at the
1282L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1283member of the
1284L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1285
1286This program is free software published under the
1287L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1288
1289=cut