blob: a6aa95fe221b46b7f56016e8e80a248ee288634a [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
27use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronc11f7982017-02-21 21:20:14 +010031
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
# TODO: make output files

# TODO: Use KorAP::XML::ForkPool!
37
Akron941c1a62016-02-23 17:41:41 +010038# CHANGES:
39# ----------------------------------------------------------
40# 2013/11/25
41# - Initial release
42#
43# 2014/10/29
44# - Merges foundry data to create indexer friendly documents
45#
Akron93d620e2016-02-05 19:40:05 +010046# 2016/02/04
47# - renamed to korapxml2krill
48# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010049#
50# 2016/02/12
51# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010052# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010053#
54# 2016/02/14
55# - Added version information
Akron941c1a62016-02-23 17:41:41 +010056# - Added support for archive files
57#
58# 2016/02/15
59# - Fixed temporary directory bug
60# - Improved skipping before unzipping
61# - Added EXPERIMENTAL concurrency support
62#
63# 2016/02/23
64# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010065#
66# 2016/02/27
67# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010068#
69# 2016/03/17
70# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010071#
72# 2016/03/18
73# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020074#
Akronf3f0c942016-06-27 13:27:14 +020075# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020076# - Added multi archive support
77# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020078# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020079#
80# 2016/07/06
81# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020082#
83# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020084# - Fixed temporary path issue in script
85#
86# 2016/10/24
87# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020088#
Akronb4bbec72016-10-26 20:21:02 +020089# 2016/10/24
90# - Added support for document extraction
91#
Akron3741f8b2016-12-21 19:55:21 +010092# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020093# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020094#
Akron3741f8b2016-12-21 19:55:21 +010095# 2016/12/21
96# - added support for base-sentences and base-tokenizations
97#
Akron4fa37c32017-01-20 14:43:10 +010098# 2017/01/20
99# - added support for DRuKoLa annotations
100#
Akron41ac10b2017-02-08 22:47:25 +0100101# 2017/02/08
102# - added support for pagebreak annotations
103#
Akron821db3d2017-04-06 21:19:31 +0200104# 2017/04/06
105# - added support for wildcards in input
106#
Akron636aa112017-04-07 18:48:56 +0200107# 2017/04/07
108# - support configuration option
Akron81500102017-04-07 20:45:44 +0200109# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200110#
Akron9ec88872017-04-12 16:29:06 +0200111# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200112# - support serial processing
113# - support input root
Akron9ec88872017-04-12 16:29:06 +0200114# - introduced --sequential-extraction flag
Akron941c1a62016-02-23 17:41:41 +0100115# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100116
# Release information, printed by --help and --version
our $LAST_CHANGE = '2017/04/12';
our $LOCAL       = $FindBin::Bin;
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION"
  . " - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Prototypes
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a leading argument that does not start with a dash
# is taken as the command name and removed from the argument list
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Keep a copy of the remaining arguments before option parsing
my @keep_argv = @ARGV;

# Targets of the list options and a file wide text placeholder
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100137
# Parse options from the command line.
# All scalar option targets are declared up front and stay undefined
# unless the corresponding switch is passed.
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $to_tar, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Handler for --help: selected POD sections plus version banner
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: version banner with minimal verbosity
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  )
};

GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'to-tar'                   => \$to_tar,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);
181
Akron63f20d42017-04-10 23:40:29 +0200182
# Load from configuration.
# Values passed on the command line always win: the configuration file
# only fills scalar options that are still undefined and list options
# that are still empty.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options
  $overwrite             //= $config{overwrite};
  $gzip                  //= $config{gzip};
  $jobs                  //= $config{jobs};
  $input_base            //= $config{'input-base'};
  $extract_dir           //= $config{'temporary-extract'};
  $token_base            //= $config{token};
  $cache_file            //= $config{cache};
  $cache_size            //= $config{'cache-size'};
  $cache_delete          //= $config{'cache-delete'};
  $cache_init            //= $config{'cache-init'};
  $sequential_extraction //= $config{'sequential-extraction'};
  $meta                  //= $config{'meta'};
  $output                //= $config{'output'};
  $base_sentences        //= $config{'base-sentences'};
  $base_paragraphs       //= $config{'base-paragraphs'};
  $base_pagebreaks       //= $config{'base-pagebreaks'};
  $to_tar                //= $config{'to-tar'};
  $log_level             //= $config{'log'};

  # List options are stored as semicolon separated strings
  foreach my $pair (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $target) = @$pair;
    next if scalar(@$target) || !defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  }
}
294
Akron63f20d42017-04-10 23:40:29 +0200295
# Apply the built-in defaults to every option that is still unset
# after command line and configuration file were evaluated
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;
$base_sentences        = ''                     unless defined $base_sentences;
$base_paragraphs       = ''                     unless defined $base_paragraphs;
$base_pagebreaks       = ''                     unless defined $base_pagebreaks;

# The base layer names are compared in lower case later on
$_ = lc($_) for ($base_sentences, $base_paragraphs, $base_pagebreaks);

# Initialize log4perl object - everything is written to STDERR
my %log_config = (
  'log4perl.rootLogger'                               => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'                          => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'                   => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
323
324
# Report the configuration file that was passed
if ($cfg_file) {
  print 'Reading config from ' . $cfg_file . "\n";
}

# Usage settings for invalid invocations
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined, and gzip has no effect if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
}

# A job count of -1 is replaced by five times the number of CPUs
# reported by Sys::Info
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
}
348
Akron821db3d2017-04-06 21:19:31 +0200349
# Start serial processing: every input pattern is handed to a separate
# 'archive' run of this script, each writing to its own output below
# the given output.
# The command is undefined when the script is called without a command,
# so definedness is checked first to avoid an "uninitialized" warning.
if (defined $cmd && $cmd eq 'serial') {

  # Without --to-tar the output has to be an existing directory
  if ($output && !defined($to_tar) && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Remove all input and output parameters from the kept arguments,
  # as they are set individually for every archive run below.
  # Both the separated form ('-i value') and the joined form
  # ('--input=value') are removed.
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag
    if ($arg =~ /\A--?(?:input|i|output|o)(?:\z|(=))/) {

      # A joined value is dropped together with the flag,
      # a separated value is the next argument
      $remove_next = defined $1 ? 0 : 1;
      next;
    };

    # Value of the preceding input or output flag
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;

  # Iterate over all inputs
  foreach my $pattern (@input) {

    # Derive the name of the output from the input pattern
    my $new_out = catdir($output, get_file_name_from_glob($pattern));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit(1);
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $pattern, '-o', $new_out
    );
    print "Start serial processing of $pattern to $new_out\n";

    # Start archiving - system returns -1 if the process can't be started
    if (system(@archive_cmd) == -1) {
      $log->error("Unable to start processing of $pattern: $!");
    };
  };

  exit(0);
};
404
# Lookup table of all foundries and layers to skip (lower cased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# DeReKo: collect all base structures that are configured to be taken
# from the DeReKo structure layer
my @dereko_attr;
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotation layers as [Foundry, Layer(, Option)]
my @layers = (

  # Base - only if no other source is given
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo - with the list of base structures, if any
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested annotations
# (given as Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped, neither by
# Foundry nor by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
}
501
# Get tokenization basis, given as Foundry#Layer.
# The variables are declared separately from the conditional
# assignment, as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# TODO: This should not be initialized for batch
# Cache shared via a memory mapped file
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that converts single texts
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
525
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it:
# a leading temporary directory fragment and the base path derived
# from the first input are removed, slashes are turned into dashes
# and leading dashes are stripped. If the result has four or more
# dash separated parts, the first part is removed as well.
sub get_file_name ($) {
  my $file = shift;

  # The base path is derived from the first input. If this is a
  # directory, the trailing path segment is cut off.
  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The path is quoted, so characters like
  # '.', '(' or backslashes are matched literally instead of being
  # interpreted as regular expression syntax. The match is not
  # anchored to the start of the file name.
  $file =~ s{/?\Q$base\E}{};

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
542
Akron63f20d42017-04-10 23:40:29 +0200543
# Turn an input glob pattern into a flat name, usable as the name
# of an output directory or file.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # Path separators and bracket characters become binding dashes
  $name =~ tr!\\/{}[]!-!;

  # Wildcards for arbitrary fills are dropped
  $name =~ tr!*?!!d;

  # Reduce sequences of binding dashes to a single one
  $name =~ s/-{2,}/-/g;

  # Strip one dash at the beginning and at the end
  $name =~ s/^-//;
  $name =~ s/-$//;

  # Remove file extension
  $name =~ s/\.zip$//;

  return $name;
};
555
556
# Convert sigle to path construct (CORPUS_DOC.TEXT -> CORPUS/DOC/TEXT)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# All commands require the output to be an existing directory,
# unless the output is meant to be tarred.
# A missing directory is an error, so the exit status is non-zero.
if ($cmd) {
  if ($output && !defined($to_tar) && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };
};
566
Akron63f20d42017-04-10 23:40:29 +0200567
# Glob and prefix files
if (@input) {

  # Prefix every pattern with the input root (if given)
  # and expand the wildcards
  my @expanded = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input) . "\n";
}
587
588
# Process a single file - this is the default if no command is given
unless ($cmd) {
  my $input = $input[0];

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();

  # Processing is done - a non-zero status would signal
  # a failure to the calling process
  exit(0);
};
620
Nils Diewald59094f22014-11-05 18:20:50 +0000621
Akrone10ad322016-02-27 10:54:26 +0100622# Extract XML files
Akron81500102017-04-07 20:45:44 +0200623if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100624
Akron7d4cdd82016-08-17 21:39:45 +0200625 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200626 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100627
Akron7d4cdd82016-08-17 21:39:45 +0200628 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100629 unless ($archive->test_unzip) {
630 print "Unzip is not installed or incompatible.\n\n";
Akron81500102017-04-07 20:45:44 +0200631 exit(0);
Akrone10ad322016-02-27 10:54:26 +0100632 };
633
Akronb0c88db2016-06-29 16:33:18 +0200634 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200635 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200636
Akron651cb8d2016-08-16 21:44:49 +0200637 my $prefix = 1;
638
Akron03b24db2016-08-16 20:54:32 +0200639 # No sigles given
640 unless (@sigle) {
641
642 # Get files
643 foreach ($archive->list_texts) {
644
645 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200646 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200647
648 # TODO: Make this OS independent
649 push @sigle, join '/', $corpus, $doc, $text;
650 };
Akron20807582016-10-26 17:11:34 +0200651 }
652
653 # Check sigle for doc sigles
654 else {
655 my @new_sigle;
656
657 my $prefix_check = 0;
658
659 # Iterate over all sigle
660 foreach (@sigle) {
661
662 # Sigle is a doc sigle
663 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200664
Akron60a8caa2017-02-17 21:51:27 +0100665 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200666 # Check if a prefix is needed
667 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100668
669 if ($prefix = $archive->check_prefix) {
670 print " with prefix ...";
671 };
Akron20807582016-10-26 17:11:34 +0200672 $prefix_check = 1;
673 };
674
Akron60a8caa2017-02-17 21:51:27 +0100675 print "\n";
676
Akron20807582016-10-26 17:11:34 +0200677 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200678 my $path = ($prefix ? './' : '') . $_;
679
680 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200681 $archive->extract_doc(
Akron9ec88872017-04-12 16:29:06 +0200682 $path, $output, $sequential_extraction ? 1 : $jobs
Akron20807582016-10-26 17:11:34 +0200683 ) ? '' : 'not '
684 );
685 print "extracted.\n";
686 }
Akron60a8caa2017-02-17 21:51:27 +0100687
688 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200689 else {
690 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100691
692 unless ($prefix_check) {
693
694 if ($prefix = $archive->check_prefix) {
695 print " with prefix ...";
696 };
697 $prefix_check = 1;
698 };
Akron20807582016-10-26 17:11:34 +0200699 };
700 };
701 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200702 };
703
Akrone10ad322016-02-27 10:54:26 +0100704 # Iterate over all given sigles and extract
705 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100706
Akron2812ba22016-10-28 21:55:59 +0200707 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200708
Akron03b24db2016-08-16 20:54:32 +0200709 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200710 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100711
Akron20807582016-10-26 17:11:34 +0200712 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200713 ($prefix ? './' : '') . $_, $output
714 ) ? '' : 'not '
715 );
Akrone10ad322016-02-27 10:54:26 +0100716 print "extracted.\n";
717 };
Akronb0c88db2016-06-29 16:33:18 +0200718 }
Akron7d4cdd82016-08-17 21:39:45 +0200719
720 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200721 else {
722 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron81500102017-04-07 20:45:44 +0200723 exit(1);
Akrone10ad322016-02-27 10:54:26 +0100724 };
725}
726
Akron81500102017-04-07 20:45:44 +0200727
Akron941c1a62016-02-23 17:41:41 +0100728# Process an archive
729elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000730
Akron81500102017-04-07 20:45:44 +0200731 my $archive_output;
732
733 # First extract, then archive
734 if (defined $extract_dir) {
735
736 # Create new archive object
737 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
738
739 # Check zip capabilities
740 unless ($archive->test_unzip) {
741 print "Unzip is not installed or incompatible.\n\n";
742 exit(0);
743 };
744
745 # Add further annotation archived
746 $archive->attach($_) foreach @input[1..$#input];
747
748 # Create a temporary directory
749 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200750 $extract_dir = tempdir(CLEANUP => 0);
751 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200752 };
753
Akron63f20d42017-04-10 23:40:29 +0200754 # Add some random extra to avoid clashes with multiple archives
755 $extract_dir = catdir($extract_dir, random_string('cccccc'));
756
757 # Extract to temprary directory
Akron9ec88872017-04-12 16:29:06 +0200758 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200759 @input = ($extract_dir);
760 }
761 else {
762 $log->error('Unable to extract from primary archive ' . $input[0] .
763 ' to ' . $extract_dir);
764 exit(1);
765 };
766 }
767
768 # Can't create archive object
769 else {
770 $log->error('Unable to extract from primary archive ' . $input[0]);
771 exit(1);
772 };
773 };
774
Akrone1dbc382016-07-08 22:24:52 +0200775 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100776
Akron7d4cdd82016-08-17 21:39:45 +0200777 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100778 my $pool = Parallel::ForkManager->new($jobs);
779
Akron7d4cdd82016-08-17 21:39:45 +0200780 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100781 my $iter = 1; # Current text in process
782
783 # Report on fork message
784 $pool->run_on_finish (
785 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200786 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100787 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200788
Akron08385f62016-03-22 20:37:04 +0100789 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200790 ($iter++) . "/$count]" .
791 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200792 ' ' . $data->[0] . "\n";
793 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100794 }
795 );
796
797 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200798 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100799 print "Reading data ...\n";
800
Akron7d4cdd82016-08-17 21:39:45 +0200801 # unless (Cache::FastMmap->new(
802 # share_file => $cache_file,
803 # cache_size => $cache_size,
804 # init_file => $cache_init
805 # )) {
806 # print "Unable to intialize cache '$cache_file'\n\n";
807 # exit(1);
808 # };
Akron11c80302016-03-18 19:44:43 +0100809
Akron486f9ab2017-04-22 23:25:19 +0200810 my $tar_archive;
811 my $output_dir = $output;
812
813 # Initialize tar archive
814 if ($to_tar) {
815 $tar_archive = Archive::Tar::Builder->new(
816 ignore_errors => 1
817 );
818
819 # Set output name
820 my $tar_file = $output;
821 unless ($tar_file =~ /\.tar$/) {
822 $tar_file .= '.tar';
823 };
824
825 # Initiate the tar file
826 print "Writing to file $tar_file\n";
827 my $fh = IO::File->new($tar_file, 'w');
828 $fh->binmode(1);
829
830 # Set handle
831 $tar_archive->set_handle($fh);
832
833 # Output to temporary directory
834 $output_dir = File::Temp->newdir;
835 };
836
837
Akron941c1a62016-02-23 17:41:41 +0100838 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100839 if (-d $input[0]) {
840 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100841 my @dirs;
842 my $dir;
843
Akron7d4cdd82016-08-17 21:39:45 +0200844 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100845 while (1) {
846 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200847 push @dirs, $dir;
848 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100849 };
850 last unless $it->next;
851 };
852
853 print "Start processing ...\n";
854 $t = Benchmark->new;
855 $count = scalar @dirs;
856
857 DIRECTORY_LOOP:
858 for (my $i = 0; $i < $count; $i++) {
859
Akrone1dbc382016-07-08 22:24:52 +0200860 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200861 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200862 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200863 );
Akron941c1a62016-02-23 17:41:41 +0100864
865 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200866 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200867
Akron13d56622016-10-31 14:54:49 +0100868 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron081639e2017-04-21 19:01:39 +0200869
870 # Add to tar archive
871 if ($to_tar) {
872 $tar_archive->archive($filename);
873 unlink $filename;
874 };
Akron486f9ab2017-04-22 23:25:19 +0200875
876 $pool->finish(
877 0,
878 ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
879 );
Akron3ec48972016-08-17 23:24:52 +0200880 }
881 else {
Akron4c0cf312016-10-15 16:42:09 +0200882 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200883 };
Akron941c1a62016-02-23 17:41:41 +0100884 };
885 }
886
887 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200888 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200889
Akron941c1a62016-02-23 17:41:41 +0100890 unless ($archive->test_unzip) {
891 print "Unzip is not installed or incompatible.\n\n";
892 exit(1);
893 };
894
Akron08385f62016-03-22 20:37:04 +0100895 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200896 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100897
Akron941c1a62016-02-23 17:41:41 +0100898 print "Start processing ...\n";
899 $t = Benchmark->new;
900 my @dirs = $archive->list_texts;
901 $count = scalar @dirs;
902
903 ARCHIVE_LOOP:
904 for (my $i = 0; $i < $count; $i++) {
905
906 # Split path information
907 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
908
Akrone1dbc382016-07-08 22:24:52 +0200909 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200910 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200911 get_file_name(
912 catfile($corpus, $doc, $text)
913 . '.json' . ($gzip ? '.gz' : '')
914 )
Akrone1dbc382016-07-08 22:24:52 +0200915 );
Akron941c1a62016-02-23 17:41:41 +0100916
917 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200918 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100919
Akron4c0cf312016-10-15 16:42:09 +0200920 # Create temporary file
921 $temp = File::Temp->newdir;
922
Akronbdf434a2016-10-24 17:42:07 +0200923 # TODO: Check if $filename exist at the beginning,
924 # because extraction can be horrible slow!
925
Akron941c1a62016-02-23 17:41:41 +0100926 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200927 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100928
Akron7d4cdd82016-08-17 21:39:45 +0200929 # Create corpus directory
930 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100931
Akron7d4cdd82016-08-17 21:39:45 +0200932 # Temporary directory
933 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100934
Akron7d4cdd82016-08-17 21:39:45 +0200935 # Write file
Akron13d56622016-10-31 14:54:49 +0100936 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200937
938 # Add to tar archive
939 if ($to_tar) {
940 $tar_archive->archive($filename);
941 unlink $filename;
942 };
943
Akron4c0cf312016-10-15 16:42:09 +0200944 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100945 $pool->finish(
946 0,
947 ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
948 );
949 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200950 }
951 else {
Akron4c0cf312016-10-15 16:42:09 +0200952 # Delete temporary file
953 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200954 };
Akron941c1a62016-02-23 17:41:41 +0100955 }
Akron7d4cdd82016-08-17 21:39:45 +0200956
957 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100958 else {
Akron4c0cf312016-10-15 16:42:09 +0200959 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100960 };
961 };
962 }
963
964 else {
965 print "Input is neither a directory nor an archive.\n\n";
966 };
967
968 $pool->wait_all_children;
969
Akron11c80302016-03-18 19:44:43 +0100970 # Delete cache file
971 unlink($cache_file) if $cache_delete;
972
Akron63f20d42017-04-10 23:40:29 +0200973 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100974 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200975};
Akron941c1a62016-02-23 17:41:41 +0100976
Nils Diewald2db9ad02013-10-29 19:26:43 +0000977
Akron63f20d42017-04-10 23:40:29 +0200978# Cleanup temporary extraction directory (only runs when $extract_dir is set)
979if ($extract_dir) {
980 my $objects = remove_tree($extract_dir, { safe => 1 }); # File::Path::remove_tree returns the number of deleted entries; "safe" makes it skip entries it lacks permission to delete instead of altering permissions
981 print "Removed directory $extract_dir with $objects objects.\n";
982};
983
984
985print "\n";
986
Nils Diewald2db9ad02013-10-29 19:26:43 +0000987__END__
Akron941c1a62016-02-23 17:41:41 +0100988
989=pod
990
991=encoding utf8
992
993=head1 NAME
994
Akronf7ad89e2016-03-16 18:22:47 +0100995korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100996
997
998=head1 SYNOPSIS
999
Akrona76d8352016-10-27 16:27:32 +02001000 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001001
Akron2fd402b2016-10-27 21:26:48 +02001002
Akron941c1a62016-02-23 17:41:41 +01001003=head1 DESCRIPTION
1004
1005L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1006compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001007The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001008
1009
1010=head1 INSTALLATION
1011
1012The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1013
Akronaf386982016-10-12 00:33:25 +02001014 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001015
Akronc13a1702016-03-15 19:33:14 +01001016In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001017be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +02001018Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akrona93d51b2016-10-24 20:27:48 +02001019In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001020
1021=head1 ARGUMENTS
1022
Akrona76d8352016-10-27 16:27:32 +02001023 $ korapxml2krill -z --input <directory> --output <filename>
1024
1025Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001026It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001027
Akron941c1a62016-02-23 17:41:41 +01001028=over 2
1029
1030=item B<archive>
1031
Akron081639e2017-04-21 19:01:39 +02001032 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001033
Akron2fd402b2016-10-27 21:26:48 +02001034Converts an archive of KorAP-XML documents. It expects a directory
1035(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001036
1037=item B<extract>
1038
Akrona76d8352016-10-27 16:27:32 +02001039 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1040
1041Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001042
Akron63f20d42017-04-10 23:40:29 +02001043=item B<serial>
1044
1045 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1046
1047Convert archives sequentially. The inputs are not merged but treated
1048as they are (so they may be premerged or globs).
1049The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001050are created based on the archive name. In case the C<--to-tar> flag is given,
1051the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001052
1053
Akron941c1a62016-02-23 17:41:41 +01001054=back
1055
1056
1057=head1 OPTIONS
1058
1059=over 2
1060
Akrona76d8352016-10-27 16:27:32 +02001061=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001062
Akrona76d8352016-10-27 16:27:32 +02001063Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001064
Akron7606afa2016-10-25 16:23:49 +02001065Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001066document, while C<archive> expects a KorAP-XML corpus folder or a zip
1067file to batch process multiple files.
1068C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001069
Akrona76d8352016-10-27 16:27:32 +02001070C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001071that the first archive listed contains all primary data files
1072and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001073
Akron7606afa2016-10-25 16:23:49 +02001074 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001075
Akron821db3d2017-04-06 21:19:31 +02001076Input may also be defined using BSD glob wildcards.
1077
1078 -i 'file/news*.zip'
1079
1080The extended input array will be sorted in length order, so the shortest
1081path needs to contain all primary data files and all meta data files.
1082
Akron0c3e3752016-06-28 15:55:53 +02001083(The directory structure follows the base directory format,
1084that may include a C<.> root folder.
1085In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001086need to be passed with a hash sign in front of the archive's name.
1087This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001088
Akron7606afa2016-10-25 16:23:49 +02001089To support zip files, a version of C<unzip> needs to be installed that is
1090compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001091
Akron7606afa2016-10-25 16:23:49 +02001092B<The root folder switch using the hash sign is experimental and
1093may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001094
Akron63f20d42017-04-10 23:40:29 +02001095=item B<--input-base|-ib> <directory>
1096
1097The base directory for inputs.
1098
1099
Akron941c1a62016-02-23 17:41:41 +01001100=item B<--output|-o> <directory|file>
1101
1102Output folder for archive processing or
1103document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001104writes to C<STDOUT> by default
1105(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001106
1107=item B<--overwrite|-w>
1108
1109Overwrite files that already exist.
1110
Akron3741f8b2016-12-21 19:55:21 +01001111=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001112
1113Define the default tokenization by specifying
1114the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001115of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001116
Akron3741f8b2016-12-21 19:55:21 +01001117
1118=item B<--base-sentences|-bs> <foundry>#<layer>
1119
1120Define the layer for base sentences.
1121If given, this will be used instead of using C<Base#Sentences>.
1122Currently C<DeReKo#Structure> is the only additional layer supported.
1123
1124 Defaults to unset.
1125
1126
1127=item B<--base-paragraphs|-bp> <foundry>#<layer>
1128
1129Define the layer for base paragraphs.
1130If given, this will be used instead of using C<Base#Paragraphs>.
1131Currently C<DeReKo#Structure> is the only additional layer supported.
1132
1133 Defaults to unset.
1134
1135
Akron41ac10b2017-02-08 22:47:25 +01001136=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1137
1138Define the layer for base pagebreaks.
1139Currently C<DeReKo#Structure> is the only layer supported.
1140
1141 Defaults to unset.
1142
1143
Akron941c1a62016-02-23 17:41:41 +01001144=item B<--skip|-s> <foundry>[#<layer>]
1145
Akronf7ad89e2016-03-16 18:22:47 +01001146Skip specific annotations by specifying the foundry
1147(and optionally the layer with a C<#>-prefix),
1148e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001149Can be set multiple times.
1150
Akronc13a1702016-03-15 19:33:14 +01001151=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001152
Akronf7ad89e2016-03-16 18:22:47 +01001153Convert specific annotations by specifying the foundry
1154(and optionally the layer with a C<#>-prefix),
1155e.g. C<Mate> or C<Mate#Morpho>.
1156Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001157
1158=item B<--primary|-p>
1159
Akronc13a1702016-03-15 19:33:14 +01001160Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001161Can be flagged using C<--no-primary> as well.
1162This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001163
1164=item B<--jobs|-j>
1165
1166Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001167for archive processing.
Akron11c80302016-03-18 19:44:43 +01001168Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001169
1170If C<sequential-extraction> is not set to false, this will
1171also apply to extraction.
1172
Akronc11f7982017-02-21 21:20:14 +01001173Pass -1, and the value will be set automatically to 5
1174times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001175This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001176
Akron9ec88872017-04-12 16:29:06 +02001177=item B<--sequential-extraction|-se>
1178
1179Flag to indicate whether the C<jobs> value also applies to extraction.
1180Some systems may have problems with extracting multiple archives
1181to the same folder at the same time.
1182Can be flagged using C<--no-sequential-extraction> as well.
1183Defaults to C<false>.
1184
Akron35db6e32016-03-17 22:42:22 +01001185=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001186
Akron35db6e32016-03-17 22:42:22 +01001187Define the metadata parser to use. Defaults to C<I5>.
1188Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1189This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001190
1191=item B<--pretty|-y>
1192
Akronc13a1702016-03-15 19:33:14 +01001193Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001194This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001195
1196=item B<--gzip|-z>
1197
Akronf7ad89e2016-03-16 18:22:47 +01001198Compress the output.
1199Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001200
Akron11c80302016-03-18 19:44:43 +01001201=item B<--cache|-c>
1202
1203File to mmap a cache (using L<Cache::FastMmap>).
1204Defaults to C<korapxml2krill.cache> in the calling directory.
1205
1206=item B<--cache-size|-cs>
1207
1208Size of the cache. Defaults to C<50m>.
1209
1210=item B<--cache-init|-ci>
1211
1212Initialize cache file.
1213Can be flagged using C<--no-cache-init> as well.
1214Defaults to C<true>.
1215
1216=item B<--cache-delete|-cd>
1217
1218Delete cache file after processing.
1219Can be flagged using C<--no-cache-delete> as well.
1220Defaults to C<true>.
1221
Akron636aa112017-04-07 18:48:56 +02001222=item B<--config|-cfg>
1223
1224Configure the parameters of your call in a file
1225of key-value pairs with a whitespace separator:
1226
1227 overwrite 1
1228 token DeReKo#Structure
1229 ...
1230
1231Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001232C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001233C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron9ec88872017-04-12 16:29:06 +02001234C<output>,
1235C<temp-extract>, C<sequential-extraction>,
1236C<base-sentences>, C<base-paragraphs>,
1237C<base-pagebreaks>,
1238C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001239(semicolon separated), C<anno> (semicolon separated).
1240
Akron81500102017-04-07 20:45:44 +02001241=item B<--temporary-extract|-te>
1242
1243Only valid for the C<archive> command.
1244
1245This will first extract all files into a
1246directory and then will archive.
1247If the directory is given as C<:temp:>,
1248a temporary directory is used.
1249This is especially useful to avoid
1250massive unzipping and potential
1251network latency.
Akron636aa112017-04-07 18:48:56 +02001252
Akrone10ad322016-02-27 10:54:26 +01001253=item B<--sigle|-sg>
1254
Akron20807582016-10-26 17:11:34 +02001255Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001256Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001257I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001258Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001259In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001260On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001261
Akron941c1a62016-02-23 17:41:41 +01001262=item B<--log|-l>
1263
1264The L<Log::Log4perl> log level, defaults to C<ERROR>.
1265
1266=item B<--help|-h>
1267
1268Print this document.
1269
1270=item B<--version|-v>
1271
1272Print version information.
1273
1274=back
1275
Akronc13a1702016-03-15 19:33:14 +01001276=head1 ANNOTATION SUPPORT
1277
1278L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1279developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1280The base foundry with paragraphs, sentences, and the text element are mandatory for
1281L<Krill|https://github.com/KorAP/Krill>.
1282
Akron821db3d2017-04-06 21:19:31 +02001283 Base
1284 #Paragraphs
1285 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001286
Akron821db3d2017-04-06 21:19:31 +02001287 Connexor
1288 #Morpho
1289 #Phrase
1290 #Sentences
1291 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001292
Akron821db3d2017-04-06 21:19:31 +02001293 CoreNLP
1294 #Constituency
1295 #Morpho
1296 #NamedEntities
1297 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001298
Akron821db3d2017-04-06 21:19:31 +02001299 DeReKo
1300 #Structure
Akronc13a1702016-03-15 19:33:14 +01001301
Akron821db3d2017-04-06 21:19:31 +02001302 DRuKoLa
1303 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001304
Akron821db3d2017-04-06 21:19:31 +02001305 Glemm
1306 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001307
Akron821db3d2017-04-06 21:19:31 +02001308 Malt
1309 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001310
Akron821db3d2017-04-06 21:19:31 +02001311 MarMoT
1312 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001313
Akron821db3d2017-04-06 21:19:31 +02001314 Mate
1315 #Dependency
1316 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001317
Akron821db3d2017-04-06 21:19:31 +02001318 MDParser
1319 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001320
Akron821db3d2017-04-06 21:19:31 +02001321 OpenNLP
1322 #Morpho
1323 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001324
Akron821db3d2017-04-06 21:19:31 +02001325 Sgbr
1326 #Lemma
1327 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001328
Akron821db3d2017-04-06 21:19:31 +02001329 TreeTagger
1330 #Morpho
1331 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001332
Akron821db3d2017-04-06 21:19:31 +02001333 XIP
1334 #Constituency
1335 #Morpho
1336 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001337
Akronc13a1702016-03-15 19:33:14 +01001338
1339More importers are in preparation.
1340New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1341See the built-in annotation importers as examples.
1342
Akron941c1a62016-02-23 17:41:41 +01001343=head1 AVAILABILITY
1344
1345 https://github.com/KorAP/KorAP-XML-Krill
1346
1347
1348=head1 COPYRIGHT AND LICENSE
1349
Akron3ec0a1c2017-01-18 14:41:55 +01001350Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001351
Akron941c1a62016-02-23 17:41:41 +01001352Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001353
Akrona76d8352016-10-27 16:27:32 +02001354Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001355
1356L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1357Corpus Analysis Platform at the
1358L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1359member of the
1360L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1361
1362This program is free software published under the
1363L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1364
1365=cut