blob: 00e9216f23b5b884e38a3bf0c2c0ed1e5820c521 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akronc11f7982017-02-21 21:20:14 +010029
30# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010031# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010032# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010033
Akronc11f7982017-02-21 21:20:14 +010034# TODO: Use KorAP::XML::ForkPool!
35
Akron941c1a62016-02-23 17:41:41 +010036# CHANGES:
37# ----------------------------------------------------------
38# 2013/11/25
39# - Initial release
40#
41# 2014/10/29
42# - Merges foundry data to create indexer friendly documents
43#
Akron93d620e2016-02-05 19:40:05 +010044# 2016/02/04
45# - renamed to korapxml2krill
46# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010047#
48# 2016/02/12
49# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010050# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010051#
52# 2016/02/14
53# - Added version information
Akron941c1a62016-02-23 17:41:41 +010054# - Added support for archive files
55#
56# 2016/02/15
57# - Fixed temporary directory bug
58# - Improved skipping before unzipping
59# - Added EXPERIMENTAL concurrency support
60#
61# 2016/02/23
62# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010063#
64# 2016/02/27
65# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010066#
67# 2016/03/17
68# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010069#
70# 2016/03/18
71# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020072#
Akronf3f0c942016-06-27 13:27:14 +020073# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020074# - Added multi archive support
75# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020076# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020077#
78# 2016/07/06
79# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020080#
81# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020082# - Fixed temporary path issue in script
83#
84# 2016/10/24
85# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020086#
Akronb4bbec72016-10-26 20:21:02 +020087# 2016/10/24
88# - Added support for document extraction
89#
Akron3741f8b2016-12-21 19:55:21 +010090# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020091# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020092#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/12/21
94# - added support for base-sentences and base-tokenizations
95#
Akron4fa37c32017-01-20 14:43:10 +010096# 2017/01/20
97# - added support for DRuKoLa annotations
98#
Akron41ac10b2017-02-08 22:47:25 +010099# 2017/02/08
100# - added support for pagebreak annotations
101#
Akron821db3d2017-04-06 21:19:31 +0200102# 2017/04/06
103# - added support for wildcards in input
104#
Akron636aa112017-04-07 18:48:56 +0200105# 2017/04/07
106# - support configuration option
Akron81500102017-04-07 20:45:44 +0200107# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200108#
Akron63f20d42017-04-10 23:40:29 +0200109# 2017/04/10
110# - support serial processing
111# - support input root
Akron941c1a62016-02-23 17:41:41 +0100112# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100113
# Date of the newest entry in the CHANGES list above
# (shown in the version banner)
our $LAST_CHANGE = '2017/04/10';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Banner printed along with the help and version output
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Prototypes (the subroutines are defined further below)
sub get_file_name_from_glob($);
sub get_file_name($);
123
# Parse command
# The first argument is taken as the command,
# unless it starts like an option switch
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Remember the remaining arguments, so they can be
# passed on when the script calls itself
my @keep_argv = @ARGV;

# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100134
# Parse options from the command line.
# Options that are not passed stay undefined (or empty), so they can
# be filled from the configuration file or with defaults afterwards.
# Unknown or malformed options abort the run with usage information,
# instead of being silently ignored.
GetOptions(
  'input|i=s'             => \@input,
  'input-base|ib=s'       => \(my $input_base),
  'output|o=s'            => \(my $output),
  'overwrite|w'           => \(my $overwrite),
  'meta|m=s'              => \(my $meta),
  'token|t=s'             => \(my $token_base),
  'base-sentences|bs=s'   => \(my $base_sentences),
  'base-paragraphs|bp=s'  => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
  'gzip|z'                => \(my $gzip),
  'temporary-extract|te=s' => \(my $extract_dir),
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \(my $cache_file),
  'config|cfg=s'          => \(my $cfg_file),
  'log|l=s'               => \(my $log_level),
  'anno|a=s'              => \@anno,
  'primary|p!'            => \(my $primary),
  'pretty|y'              => \(my $pretty),
  'jobs|j=i'              => \(my $jobs),
  'cache-size|cs=s'       => \(my $cache_size),
  'cache-delete|cd!'      => \(my $cache_delete),
  'cache-init|ci!'        => \(my $cache_init),

  # Print the full help text and exit
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner and exit
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
176
Akron63f20d42017-04-10 23:40:29 +0200177
# Load from configuration.
# Values from the command line always take precedence - the
# configuration only fills in what was not passed explicitly.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: configuration key and the variable to fill
  my @scalar_options = (
    [ 'overwrite'         => \$overwrite ],
    [ 'gzip'              => \$gzip ],
    [ 'jobs'              => \$jobs ],
    [ 'input-base'        => \$input_base ],
    [ 'temporary-extract' => \$extract_dir ],
    [ 'token'             => \$token_base ],
    [ 'cache'             => \$cache_file ],
    [ 'cache-size'        => \$cache_size ],
    [ 'cache-delete'      => \$cache_delete ],
    [ 'cache-init'        => \$cache_init ],
    [ 'meta'              => \$meta ],
    [ 'output'            => \$output ],
    [ 'base-sentences'    => \$base_sentences ],
    [ 'base-paragraphs'   => \$base_paragraphs ],
    [ 'base-pagebreaks'   => \$base_pagebreaks ],
    [ 'log'               => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # Multi value options: given as a semicolon separated list
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
279
Akron63f20d42017-04-10 23:40:29 +0200280
# Set defaults for everything that was neither passed on the
# command line nor set in the configuration file
$token_base   //= 'OpenNLP#tokens';
$cache_file   //= 'korapxml2krill.cache';
$cache_size   //= '50m';
$jobs         //= 0;
$cache_delete //= 1;
$cache_init   //= 1;
$log_level    //= 'ERROR';

# The base annotations default to the empty string
# and are normalized to lower case
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');
Akron3741f8b2016-12-21 19:55:21 +0100296
Akron63f20d42017-04-10 23:40:29 +0200297
# Initialize log4perl object - all messages of the
# chosen level (or above) are written to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# Report on the configuration file - a file that does not exist
# was never read, so it must not be announced as being read
if ($cfg_file) {
  if (-e $cfg_file) {
    print "Reading config from $cfg_file\n";
  }
  else {
    warn "Configuration file '$cfg_file' does not exist and is ignored\n";
  };
};
310
311
# Parameters for printing the usage information and
# exiting with an error
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};

# A job count of -1 means: five jobs per available core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
332
Akron821db3d2017-04-06 21:19:31 +0200333
# Start serial processing:
# Each input is converted by a separate "archive" run of this script,
# writing to a directory of its own below the output directory.
# Exits with 0 if all runs succeeded and with 1 otherwise.
if (defined $cmd && $cmd eq 'serial') {

  # An existing output directory is required - without one, the
  # per-input directories would be created at the file system root
  unless (defined $output && -d $output) {
    print "Directory '" . ($output // '') . "' does not exist.\n\n";
    exit(1);
  };

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set separately for each run. This covers the separated
  # form ("-i X", "--input X") as well as the joined form ("--input=X").
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag
    if ($_ =~ /^--?(?:input|i|output|o)(=.*)?$/s) {

      # The value follows as the next argument, unless it is joined
      $remove_next = defined $1 ? 0 : 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Remember if any of the runs failed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $pattern (@input) {

    my $new_out = catdir($output, get_file_name_from_glob($pattern));

    # Create new path
    if (make_path($new_out) == 0 && !-d $new_out) {
      $log->error("Can\'t create path $new_out");
      exit(1);
    };

    # Create archive command
    # (list form, so no shell is involved in interpreting the arguments)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $pattern, '-o', $new_out);
    print "Start serial processing of $pattern to $new_out\n";

    # Start archiving
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $pattern failed");
      $failed = 1;
    };
  };

  exit($failed);
};
385
# Lookup table for all foundries and layers to skip (lower cased)
my %skip = map { (lc($_) => 1) } @skip;

# Collect all base annotations that are taken from the DeReKo structure
my %base_of = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $base_of{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

# All supported annotation layers as [Foundry, Layer] pairs
# (optionally followed by a parameter)
my @layers = (

  # Base - only if no other base annotation is chosen
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo - parametrized, if it serves as a base annotation
  (
    @dereko_attr ?
      ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
      ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
462
Akron4fa37c32017-01-20 14:43:10 +0100463
# Check filters
my @filtered_anno;

# Everything is skipped - only take the annotations
# that were requested explicitly (given as Foundry#Layer)
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Take all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
482
# Get tokenization basis (given as Foundry#Layer).
# Declaration and conditional assignment are separated, as the
# behaviour of "my" combined with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that is used for converting
# every single text in all processing modes
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
506
# Get file name based on path information.
# Expects the path of a text and returns a flat name, in which
# the path separators are replaced by dashes. Temporary directory
# fragments and the base of the first input are removed.
sub get_file_name ($) {
  my $file = shift;

  # The base is the first input - for directories without the last
  # path fragment (a path with a trailing slash is kept as is)
  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base. The base is quoted,
  # so characters like '.', '+' or '(' in the input path are matched
  # literally and can't break or alter the pattern.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the leading fragment, if at least three
  # dash separated fragments remain
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
523
Akron63f20d42017-04-10 23:40:29 +0200524
# Turn an input pattern (possibly containing wildcards)
# into a name usable for an output directory
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  for ($name) {
    s/\.zip$//;        # Strip the archive extension
    tr!\\/{}[]!-!;     # Path separators and brackets become binding characters
    tr/*?//d;          # Wildcards are dropped
    s/-{2,}/-/g;       # Squeeze sequences of binding characters
    s/^-//;            # No binding character at the beginning
    s/-$//;            # No binding character at the end
  };

  return $name;
};
536
537
# Convert sigle to path construct
# (CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# All commands require the output to be an existing directory,
# if it is given. A missing directory is a failure,
# so it is reported with a non-zero exit code.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
547
Akron63f20d42017-04-10 23:40:29 +0200548
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand all wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
568
569
# Process a single file
# (this is the default, if no command is given)
unless ($cmd) {
  my $input = $input[0];

  # Remember the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and the overall time
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # The input may be empty after globbing
  unless (defined $input) {
    $log->error('No input found');
    exit(1);
  };

  # The document path requires a trailing slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();

  # Processing is finished - this is not a failure
  exit(0);
};
601
Nils Diewald59094f22014-11-05 18:20:50 +0000602
# Extract XML files from the archives to the output directory.
# Without sigles, all texts are extracted. With sigles, document
# sigles (CORPUS/DOC) are extracted as whole documents and all
# other sigles as single texts.
if ($cmd eq 'extract') {

  # Create new archive object - the first input is the primary archive
  my $archive;
  unless (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {

    # Can't create archive object
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities - a missing unzip is a failure
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archived
  $archive->attach($_) foreach @input[1..$#input];

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    foreach ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  }

  # Check sigle for doc sigles
  else {
    my @new_sigle;

    # Check only once if the paths in the archive need a prefix
    my $prefix_check = 0;
    my $check_prefix_once = sub {
      return if $prefix_check;

      if ($prefix = $archive->check_prefix) {
        print " with prefix ...";
      };
      $prefix_check = 1;
      return;
    };

    # Iterate over all sigle
    foreach (@sigle) {

      # Sigle is a doc sigle
      if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

        print "$_ ...";

        # Check if a prefix is needed
        $check_prefix_once->();

        print "\n";

        # TODO: Make this OS independent
        my $path = ($prefix ? './' : '') . $_;

        print '... ' . (
          $archive->extract_doc(
            $path, $output, $jobs
          ) ? '' : 'not '
        );
        print "extracted.\n";
      }

      # Sigle is a text sigle - extracted below
      else {
        push @new_sigle, $_;

        $check_prefix_once->();
      };
    };
    @sigle = @new_sigle;
  };

  # Iterate over all given sigles and extract
  foreach (@sigle) {

    print "$_ ...\n";

    # TODO: Make this OS independent
    print '... ' . (
      $archive->extract_text(
        ($prefix ? './' : '') . $_, $output
      ) ? '' : 'not '
    );
    print "extracted.\n";
  };
}
707
Akron81500102017-04-07 20:45:44 +0200708
# Process an archive or a directory:
# All texts are converted by the batch object - optionally in
# parallel and optionally after extracting the archive completely
# to a temporary directory.
elsif ($cmd eq 'archive') {

  # First extract, then archive
  if (defined $extract_dir) {

    # Create new archive object
    my $archive;
    unless (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {

      # Can't create archive object
      $log->error('Unable to extract from primary archive ' . $input[0]);
      exit(1);
    };

    # Check zip capabilities - a missing unzip is a failure
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # Create a temporary directory
    if ($extract_dir eq ':temp:') {
      $extract_dir = tempdir(CLEANUP => 0);
      print "Temporarily extract to $extract_dir\n";
    };

    # Add some random extra to avoid clashes with multiple archives
    $extract_dir = catdir($extract_dir, random_string('cccccc'));

    # Extract to temporary directory
    unless ($archive->extract_all($extract_dir, $jobs)) {
      $log->error('Unable to extract from primary archive ' . $input[0] .
                    ' to ' . $extract_dir);
      exit(1);
    };

    # From now on the extracted directory is the only input
    @input = ($extract_dir);
  };

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1;  # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed by finish() is the last parameter:
      # The message, optionally followed by the temporary directory
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      # A return value of -1 is reported as "already existing"
      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed along,
        # so it can be released on finish
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  # Nothing was processed - this is a failure and there is
  # neither a processing time nor a success to report
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;

    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print timestr(timediff(Benchmark->new, $t))."\n";
  print "Done.\n";
};
Akron941c1a62016-02-23 17:41:41 +0100915
Nils Diewald2db9ad02013-10-29 19:26:43 +0000916
Akron63f20d42017-04-10 23:40:29 +0200917# Cleanup temporary extraction directory
918if ($extract_dir) {
919 my $objects = remove_tree($extract_dir, { safe => 1 });
920 print "Removed directory $extract_dir with $objects objects.\n";
921};
922
923
924print "\n";
925
Nils Diewald2db9ad02013-10-29 19:26:43 +0000926__END__
Akron941c1a62016-02-23 17:41:41 +0100927
928=pod
929
930=encoding utf8
931
932=head1 NAME
933
Akronf7ad89e2016-03-16 18:22:47 +0100934korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100935
936
937=head1 SYNOPSIS
938
Akrona76d8352016-10-27 16:27:32 +0200939 korapxml2krill [archive|extract|serial] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100940
Akron2fd402b2016-10-27 21:26:48 +0200941
Akron941c1a62016-02-23 17:41:41 +0100942=head1 DESCRIPTION
943
944L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
945compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100946The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100947
948
949=head1 INSTALLATION
950
951The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
952
Akronaf386982016-10-12 00:33:25 +0200953 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100954
Akronc13a1702016-03-15 19:33:14 +0100955In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100956be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200957Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +0200958In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100959
960=head1 ARGUMENTS
961
Akrona76d8352016-10-27 16:27:32 +0200962 $ korapxml2krill -z --input <directory> --output <filename>
963
964Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200965It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200966
Akron941c1a62016-02-23 17:41:41 +0100967=over 2
968
969=item B<archive>
970
Akrona76d8352016-10-27 16:27:32 +0200971 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
972
Akron2fd402b2016-10-27 21:26:48 +0200973Converts an archive of KorAP-XML documents. It expects a directory
974(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100975
976=item B<extract>
977
Akrona76d8352016-10-27 16:27:32 +0200978 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
979
980Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100981
Akron63f20d42017-04-10 23:40:29 +0200982=item B<serial>
983
984 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
985
986Convert archives sequentially. The inputs are not merged but treated
987as they are (so they may be premerged or globs).
988The C<--output> directory is treated as the base directory where subdirectories
989are created based on the archive name.
990
991
Akron941c1a62016-02-23 17:41:41 +0100992=back
993
994
995=head1 OPTIONS
996
997=over 2
998
Akrona76d8352016-10-27 16:27:32 +0200999=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001000
Akrona76d8352016-10-27 16:27:32 +02001001Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001002
Akron7606afa2016-10-25 16:23:49 +02001003Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001004document, while C<archive> expects a KorAP-XML corpus folder or a zip
1005file to batch process multiple files.
1006C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001007
Akrona76d8352016-10-27 16:27:32 +02001008C<archive> supports multiple input zip files with the constraint
Akron2cfe8092016-06-24 17:48:49 +02001009that the first archive listed contains all primary data files
1010and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001011
Akron7606afa2016-10-25 16:23:49 +02001012 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001013
Akron821db3d2017-04-06 21:19:31 +02001014Input may also be defined using BSD glob wildcards.
1015
1016 -i 'file/news*.zip'
1017
1018The extended input array will be sorted in length order, so the shortest
1019path needs to contain all primary data files and all meta data files.
1020
Akron0c3e3752016-06-28 15:55:53 +02001021(The directory structure follows the base directory format,
1022that may include a C<.> root folder.
1023In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001024need to be passed with a hash sign in front of the archive's name.
1025This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001026
Akron7606afa2016-10-25 16:23:49 +02001027To support zip files, a version of C<unzip> needs to be installed that is
1028compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001029
Akron7606afa2016-10-25 16:23:49 +02001030B<The root folder switch using the hash sign is experimental and
1031may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001032
Akron63f20d42017-04-10 23:40:29 +02001033=item B<--input-base|-ib> <directory>
1034
1035The base directory for inputs.
1036
1037
Akron941c1a62016-02-23 17:41:41 +01001038=item B<--output|-o> <directory|file>
1039
1040Output folder for archive processing or
1041document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001042writes to C<STDOUT> by default
1043(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001044
1045=item B<--overwrite|-w>
1046
1047Overwrite files that already exist.
1048
Akron3741f8b2016-12-21 19:55:21 +01001049=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001050
1051Define the default tokenization by specifying
1052the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001053of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001054
Akron3741f8b2016-12-21 19:55:21 +01001055
1056=item B<--base-sentences|-bs> <foundry>#<layer>
1057
1058Define the layer for base sentences.
1059If given, this will be used instead of using C<Base#Sentences>.
1060Currently C<DeReKo#Structure> is the only additional layer supported.
1061
1062 Defaults to unset.
1063
1064
1065=item B<--base-paragraphs|-bp> <foundry>#<layer>
1066
1067Define the layer for base paragraphs.
1068If given, this will be used instead of using C<Base#Paragraphs>.
1069Currently C<DeReKo#Structure> is the only additional layer supported.
1070
1071 Defaults to unset.
1072
1073
Akron41ac10b2017-02-08 22:47:25 +01001074=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1075
1076Define the layer for base pagebreaks.
1077Currently C<DeReKo#Structure> is the only layer supported.
1078
1079 Defaults to unset.
1080
1081
Akron941c1a62016-02-23 17:41:41 +01001082=item B<--skip|-s> <foundry>[#<layer>]
1083
Akronf7ad89e2016-03-16 18:22:47 +01001084Skip specific annotations by specifying the foundry
1085(and optionally the layer with a C<#>-prefix),
1086e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001087Can be set multiple times.
1088
Akronc13a1702016-03-15 19:33:14 +01001089=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001090
Akronf7ad89e2016-03-16 18:22:47 +01001091Convert specific annotations by specifying the foundry
1092(and optionally the layer with a C<#>-prefix),
1093e.g. C<Mate> or C<Mate#Morpho>.
1094Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001095
1096=item B<--primary|-p>
1097
Akronc13a1702016-03-15 19:33:14 +01001098Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001099Can be flagged using C<--no-primary> as well.
1100This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001101
1102=item B<--jobs|-j>
1103
1104Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001105for archive processing.
Akron11c80302016-03-18 19:44:43 +01001106Defaults to C<0> (everything runs in a single process).
Akronc11f7982017-02-21 21:20:14 +01001107Pass -1, and the value will be set automatically to 5
1108times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001109This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001110
Akron35db6e32016-03-17 22:42:22 +01001111=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001112
Akron35db6e32016-03-17 22:42:22 +01001113Define the metadata parser to use. Defaults to C<I5>.
1114Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1115This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001116
1117=item B<--pretty|-y>
1118
Akronc13a1702016-03-15 19:33:14 +01001119Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001120This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001121
1122=item B<--gzip|-z>
1123
Akronf7ad89e2016-03-16 18:22:47 +01001124Compress the output.
1125Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001126
Akron11c80302016-03-18 19:44:43 +01001127=item B<--cache|-c>
1128
1129File to mmap a cache (using L<Cache::FastMmap>).
1130Defaults to C<korapxml2krill.cache> in the calling directory.
1131
1132=item B<--cache-size|-cs>
1133
1134Size of the cache. Defaults to C<50m>.
1135
1136=item B<--cache-init|-ci>
1137
1138Initialize cache file.
1139Can be flagged using C<--no-cache-init> as well.
1140Defaults to C<true>.
1141
1142=item B<--cache-delete|-cd>
1143
1144Delete cache file after processing.
1145Can be flagged using C<--no-cache-delete> as well.
1146Defaults to C<true>.
1147
Akron636aa112017-04-07 18:48:56 +02001148=item B<--config|-cfg>
1149
1150Configure the parameters of your call in a file
1151of key-value pairs with a whitespace separator:
1152
1153 overwrite 1
1154 token DeReKo#Structure
1155 ...
1156
1157Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001158C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001159C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron81500102017-04-07 20:45:44 +02001160C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
Akron636aa112017-04-07 18:48:56 +02001161C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
1162(semicolon separated), C<anno> (semicolon separated).
1163
Akron81500102017-04-07 20:45:44 +02001164=item B<--temporary-extract|-te>
1165
1166Only valid for the C<archive> command.
1167
1168This will first extract all files into a
1169directory and then will archive.
1170If the directory is given as C<:temp:>,
1171a temporary directory is used.
1172This is especially useful to avoid
1173massive unzipping and potential
1174network latency.
Akron636aa112017-04-07 18:48:56 +02001175
Akrone10ad322016-02-27 10:54:26 +01001176=item B<--sigle|-sg>
1177
Akron20807582016-10-26 17:11:34 +02001178Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001179Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001180I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001181Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001182In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001183On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001184
Akron941c1a62016-02-23 17:41:41 +01001185=item B<--log|-l>
1186
1187The L<Log::Log4perl> log level, defaults to C<ERROR>.
1188
1189=item B<--help|-h>
1190
1191Print this document.
1192
1193=item B<--version|-v>
1194
1195Print version information.
1196
1197=back
1198
Akronc13a1702016-03-15 19:33:14 +01001199=head1 ANNOTATION SUPPORT
1200
1201L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1202developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1203The base foundry with paragraphs, sentences, and the text element is mandatory for
1204L<Krill|https://github.com/KorAP/Krill>.
1205
Akron821db3d2017-04-06 21:19:31 +02001206 Base
1207 #Paragraphs
1208 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001209
Akron821db3d2017-04-06 21:19:31 +02001210 Connexor
1211 #Morpho
1212 #Phrase
1213 #Sentences
1214 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001215
Akron821db3d2017-04-06 21:19:31 +02001216 CoreNLP
1217 #Constituency
1218 #Morpho
1219 #NamedEntities
1220 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001221
Akron821db3d2017-04-06 21:19:31 +02001222 DeReKo
1223 #Structure
Akronc13a1702016-03-15 19:33:14 +01001224
Akron821db3d2017-04-06 21:19:31 +02001225 DRuKoLa
1226 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001227
Akron821db3d2017-04-06 21:19:31 +02001228 Glemm
1229 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001230
Akron821db3d2017-04-06 21:19:31 +02001231 Malt
1232 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001233
Akron821db3d2017-04-06 21:19:31 +02001234 MarMoT
1235 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001236
Akron821db3d2017-04-06 21:19:31 +02001237 Mate
1238 #Dependency
1239 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001240
Akron821db3d2017-04-06 21:19:31 +02001241 MDParser
1242 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001243
Akron821db3d2017-04-06 21:19:31 +02001244 OpenNLP
1245 #Morpho
1246 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001247
Akron821db3d2017-04-06 21:19:31 +02001248 Sgbr
1249 #Lemma
1250 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001251
Akron821db3d2017-04-06 21:19:31 +02001252 TreeTagger
1253 #Morpho
1254 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001255
Akron821db3d2017-04-06 21:19:31 +02001256 XIP
1257 #Constituency
1258 #Morpho
1259 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001260
Akronc13a1702016-03-15 19:33:14 +01001261
1262More importers are in preparation.
1263New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1264See the built-in annotation importers as examples.
1265
Akron941c1a62016-02-23 17:41:41 +01001266=head1 AVAILABILITY
1267
1268 https://github.com/KorAP/KorAP-XML-Krill
1269
1270
1271=head1 COPYRIGHT AND LICENSE
1272
Akron3ec0a1c2017-01-18 14:41:55 +01001273Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001274
Akron941c1a62016-02-23 17:41:41 +01001275Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001276
Akrona76d8352016-10-27 16:27:32 +02001277Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001278
1279L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1280Corpus Analysis Platform at the
1281L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1282member of the
1283L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1284
1285This program is free software published under the
1286L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1287
1288=cut