blob: d00ba9e5683204029433408066dad7449990f8f8 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akron941c1a62016-02-23 17:41:41 +010073# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010074
# Release metadata: date of the last change, location of the script
# and the version line that is passed as message to pod2usage()
our $LAST_CHANGE = '2016/10/24';
our $LOCAL       = $FindBin::Bin;
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
80
# Parse command: A leading argument that does not start with a dash
# is taken (and removed from the argument list) as the command
our @ARGV;
my $cmd = ($ARGV[0] && $ARGV[0] !~ /^-/) ? shift(@ARGV) : undef;
Akron93d620e2016-02-05 19:40:05 +010087
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with a default value
my $token_base   = 'OpenNLP#tokens';
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

# Callback for --help: Print the selected sections of the manual
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Callback for --version: Print the version message
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'help|h'           => $print_help,
  'version|v'        => $print_version
);
126
# Arguments for pod2usage() in case of invalid calls:
# Print the usage sections and exit with status 1
my %ERROR_HASH = (
  -msg      => $VERSION_MSG,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000140
# Configuration of the log4perl object:
# Log everything from the requested level upwards to a colored
# screen appender named STDERR
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
150
# Lookup table of all (lowercased) values passed to --skip
my %skip = map { (lc($_) => 1) } @skip;

# All known annotations as pairs of foundry and layer
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations
# passed explicitly via --anno are taken into account
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped -
# either by their foundry or by their foundry#layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
222
# Get tokenization basis
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." (a declaration with a statement modifier) is undefined.
# Both stay undefined, if the token base is empty.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};
225
# Parameters of the memory mapped cache
# TODO: This should not be initialized for batch
my %cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(%cache_param);

# Parameters of the batch object: the cache, the tokenization basis,
# the annotations to convert and the output related switches
my %batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(%batch_param);
246
Akron941c1a62016-02-23 17:41:41 +0100247
# Get file name based on path information.
#
# Expects the path of a text and returns a flat file name:
# Temporary directory fragments and the base path of the first input
# are removed, all remaining slashes are turned into dashes
# and leading dashes are stripped.
sub get_file_name ($) {
  my $file = shift;

  # The base path is the first input. If this is an existing directory,
  # the last path segment is removed (a path with a trailing slash
  # is kept as is).
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path (with an optional leading slash).
  # The path is quoted to be matched literally - otherwise characters
  # like '.', '+' or parentheses in directory names would be treated
  # as regular expression syntax.
  # The match is intentionally not anchored, as it was before
  # (the former optional anchor '^?' had no effect).
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
263
# Convert sigle to path construct,
# e.g. "Corpus_Doc.Text" becomes "Corpus/Doc/Text"
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Commands expect the output (if given) to be an existing directory.
# A missing directory is an error, so the script
# exits with a non-zero status.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
273
274
Akron941c1a62016-02-23 17:41:41 +0100275# Process a single file
276unless ($cmd) {
Akron08385f62016-03-22 20:37:04 +0100277 my $input = $input[0];
Nils Diewald59094f22014-11-05 18:20:50 +0000278
Akron941c1a62016-02-23 17:41:41 +0100279 BEGIN {
280 $main::TIME = Benchmark->new;
281 $main::LAST_STOP = Benchmark->new;
282 };
283
284 sub stop_time {
285 my $new = Benchmark->new;
Akron5f51d422016-08-16 16:26:43 +0200286 $log->info(
Akron941c1a62016-02-23 17:41:41 +0100287 'The code took: '.
Akron5f51d422016-08-16 16:26:43 +0200288 timestr(timediff($new, $main::LAST_STOP)) .
289 ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
290 );
Akron941c1a62016-02-23 17:41:41 +0100291 $main::LAST_STOP = $new;
292 };
293
294 # Create and parse new document
295 $input =~ s{([^/])$}{$1/};
Akron941c1a62016-02-23 17:41:41 +0100296
Akron7d4cdd82016-08-17 21:39:45 +0200297 # Process file
Akrone1dbc382016-07-08 22:24:52 +0200298 $batch_file->process($input, $output);
Akron941c1a62016-02-23 17:41:41 +0100299
Akron11c80302016-03-18 19:44:43 +0100300 # Delete cache file
301 unlink($cache_file) if $cache_delete;
302
Akron5f51d422016-08-16 16:26:43 +0200303 stop_time;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000304}
Nils Diewald59094f22014-11-05 18:20:50 +0000305
# Extract XML files
# Extracts the texts (or whole documents) given as sigles from the
# input archives to the output. Without sigles, all texts are extracted.
# Exits with status 0 after the extraction and with status 1,
# if the archive can't be used.
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    # Prepend the './' root folder to all paths by default
    my $prefix = 1;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach my $path ($archive->list_texts) {

        # Split path information
        # (the prefix flag of the last text is used for the extraction)
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {

      # Text sigles remaining after all doc sigles are extracted
      my @text_sigle;

      my $prefix_check = 0;

      # Iterate over all sigle
      foreach my $sigle (@sigle) {

        # Sigle is a doc sigle (i.e. consists of two path segments only)
        if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
          print "$sigle ";

          # Check if a prefix is needed (only once)
          unless ($prefix_check) {
            $prefix = $archive->check_prefix;
            $prefix_check = 1;
          };

          # TODO: Make this OS independent
          my $extracted = $archive->extract_doc(
            ($prefix ? './' : '') . $sigle, $output
          );
          print '' . ($extracted ? '' : 'not ');
          print "extracted.\n";
        }

        # Sigle is treated as a text sigle
        else {
          push @text_sigle, $sigle;
        };
      };
      @sigle = @text_sigle;
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {
      print "$sigle ";

      # TODO: Make this OS independent
      my $extracted = $archive->extract_text(
        ($prefix ? './' : '') . $sigle, $output
      );
      print '' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # The extraction was processed - this is no failure
    exit(0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Signal the failure to the caller
    exit(1);
  };
}
393
# Process an archive
# Converts all texts of a directory or of a zip archive to files
# in the output directory - optionally in concurrent forks.
# Exits with status 1, if the input is neither a directory nor an archive.
# NOTE(review): $output is not checked to be defined in here, although
# it is passed as the directory to catfile() - confirm, if the
# option should be mandatory for this command.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories are created for texts extracted from archives,
  # so ensure File::Temp is loaded independent of other modules
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish(
    sub {
      my ($pid, $code) = @_;

      # The data structure passed to finish() is the last argument
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Drop the temporary directory passed along with the message (if any)
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to initialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Suffix of all created files
  my $suffix = '.json' . ($gzip ? '.gz' : '');

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . $suffix
      );

      # Get the next fork
      # (the parent continues with the next text)
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($text_dir => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $path (@dirs) {

      # Split path information (the prefix is not required in here)
      my (undef, $corpus, $doc, $text) = $archive->split_path($path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text) . $suffix
        )
      );

      # Get the next fork
      # (the parent continues with the next text)
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($path, $temp)) {

        # Corpus directory in the temporary directory
        my $corpus_dir = catdir("$temp", $corpus);

        # Directory of the extracted text
        my $text_dir = catdir($corpus_dir, $doc, $text);

        # Write file - the temporary directory is passed along
        # with the message to the finish handler
        if ($batch_file->process($text_dir => $filename)) {
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          $pool->finish(1, ["Unable to process " . $text_dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $path, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Nothing was processed, so there is no start time for the final
    # report (that would fail otherwise) - clean up and signal the failure
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
549
# Unknown command:
# Warn about the command and print the usage information
else {
  my $message = "Unknown command '$cmd'.\n\n";
  warn $message;
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000555
556__END__
Akron941c1a62016-02-23 17:41:41 +0100557
558=pod
559
560=encoding utf8
561
562=head1 NAME
563
Akronf7ad89e2016-03-16 18:22:47 +0100564korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100565
566
567=head1 SYNOPSIS
568
Akronc13a1702016-03-15 19:33:14 +0100569 $ korapxml2krill -z --input <directory> --output <filename>
Akron20807582016-10-26 17:11:34 +0200570 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron7606afa2016-10-25 16:23:49 +0200571 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
Akron941c1a62016-02-23 17:41:41 +0100572
573
574=head1 DESCRIPTION
575
576L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
577compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100578The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100579
580
581=head1 INSTALLATION
582
583The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
584
Akronaf386982016-10-12 00:33:25 +0200585 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100586
Akronc13a1702016-03-15 19:33:14 +0100587In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100588be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200589Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100591
592=head1 ARGUMENTS
593
Akron7606afa2016-10-25 16:23:49 +0200594Without arguments, C<korapxml2krill> processes a directory of a single KorAP-XML document.
595
Akron941c1a62016-02-23 17:41:41 +0100596=over 2
597
598=item B<archive>
599
Akron7606afa2016-10-25 16:23:49 +0200600Processes an archive as a Zip-file or a folder of KorAP-XML documents.
Akrone10ad322016-02-27 10:54:26 +0100601
602=item B<extract>
603
Akron7606afa2016-10-25 16:23:49 +0200604Extracts KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100605
606=back
607
608
609=head1 OPTIONS
610
611=over 2
612
Akron2cfe8092016-06-24 17:48:49 +0200613=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100614
Akronf7ad89e2016-03-16 18:22:47 +0100615Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100616
Akron7606afa2016-10-25 16:23:49 +0200617Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
618document, while C<archive> and C<extract> support zip archives as well.
619
620C<archive> supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200621that the first archive listed contains all primary data files
622and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200623
Akron7606afa2016-10-25 16:23:49 +0200624 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200625
Akron0c3e3752016-06-28 15:55:53 +0200626(The directory structure follows the base directory format,
627that may include a C<.> root folder.
628In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200629need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200631
Akron7606afa2016-10-25 16:23:49 +0200632To support zip files, a version of C<unzip> needs to be installed that is
633compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200634
Akron7606afa2016-10-25 16:23:49 +0200635B<The root folder switch using the hash sign is experimental and
636may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200637
Akron941c1a62016-02-23 17:41:41 +0100638=item B<--output|-o> <directory|file>
639
640Output folder for archive processing or
641document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100642writes to C<STDOUT> by default
643(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100644
645=item B<--overwrite|-w>
646
647Overwrite files that already exist.
648
649=item B<--token|-t> <foundry>[#<file>]
650
651Define the default tokenization by specifying
652the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100653of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100654
655=item B<--skip|-s> <foundry>[#<layer>]
656
Akronf7ad89e2016-03-16 18:22:47 +0100657Skip specific annotations by specifying the foundry
658(and optionally the layer with a C<#>-prefix),
659e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100660Can be set multiple times.
661
Akronc13a1702016-03-15 19:33:14 +0100662=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100663
Akronf7ad89e2016-03-16 18:22:47 +0100664Convert specific annotations by specifying the foundry
665(and optionally the layer with a C<#>-prefix),
666e.g. C<Mate> or C<Mate#Morpho>.
667Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100668
669=item B<--primary|-p>
670
Akronc13a1702016-03-15 19:33:14 +0100671Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100672Can be flagged using C<--no-primary> as well.
673This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100674
675=item B<--jobs|-j>
676
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100678for archive processing.
Akron11c80302016-03-18 19:44:43 +0100679Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100680This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100681
Akron35db6e32016-03-17 22:42:22 +0100682=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100683
Akron35db6e32016-03-17 22:42:22 +0100684Define the metadata parser to use. Defaults to C<I5>.
685Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
686This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100687
688=item B<--pretty|-y>
689
Akronc13a1702016-03-15 19:33:14 +0100690Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100691This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100692
693=item B<--gzip|-z>
694
Akronf7ad89e2016-03-16 18:22:47 +0100695Compress the output.
696Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100697
Akron11c80302016-03-18 19:44:43 +0100698=item B<--cache|-c>
699
700File to mmap a cache (using L<Cache::FastMmap>).
701Defaults to C<korapxml2krill.cache> in the calling directory.
702
703=item B<--cache-size|-cs>
704
705Size of the cache. Defaults to C<50m>.
706
707=item B<--cache-init|-ci>
708
709Initialize cache file.
710Can be flagged using C<--no-cache-init> as well.
711Defaults to C<true>.
712
713=item B<--cache-delete|-cd>
714
715Delete cache file after processing.
716Can be flagged using C<--no-cache-delete> as well.
717Defaults to C<true>.
718
Akrone10ad322016-02-27 10:54:26 +0100719=item B<--sigle|-sg>
720
Akron20807582016-10-26 17:11:34 +0200721Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100722Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100723I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200724Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200725In case the C<Text> path is omitted, the whole document will be extracted.
Akrone10ad322016-02-27 10:54:26 +0100726
Akron941c1a62016-02-23 17:41:41 +0100727=item B<--log|-l>
728
729The L<Log4perl> log level, defaults to C<ERROR>.
730
731=item B<--help|-h>
732
733Print this document.
734
735=item B<--version|-v>
736
737Print version information.
738
739=back
740
Akronc13a1702016-03-15 19:33:14 +0100741=head1 ANNOTATION SUPPORT
742
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
747
Akronf7ad89e2016-03-16 18:22:47 +0100748=over 2
Akronc13a1702016-03-15 19:33:14 +0100749
750=item B<Base>
751
752=over 4
753
Akronf7ad89e2016-03-16 18:22:47 +0100754=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100755
Akronf7ad89e2016-03-16 18:22:47 +0100756=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100757
758=back
759
760=item B<Connexor>
761
762=over 4
763
Akronf7ad89e2016-03-16 18:22:47 +0100764=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100765
Akronf7ad89e2016-03-16 18:22:47 +0100766=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100767
Akronf7ad89e2016-03-16 18:22:47 +0100768=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100769
Akronf7ad89e2016-03-16 18:22:47 +0100770=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100771
772=back
773
774=item B<CoreNLP>
775
776=over 4
777
Akronf7ad89e2016-03-16 18:22:47 +0100778=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100779
Akronf7ad89e2016-03-16 18:22:47 +0100780=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100783
Akronf7ad89e2016-03-16 18:22:47 +0100784=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100785
786=back
787
788=item B<DeReKo>
789
790=over 4
791
Akronf7ad89e2016-03-16 18:22:47 +0100792=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100793
794=back
795
796=item B<Glemm>
797
798=over 4
799
Akronf7ad89e2016-03-16 18:22:47 +0100800=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100801
802=back
803
804=item B<Mate>
805
806=over 4
807
Akronf7ad89e2016-03-16 18:22:47 +0100808=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100809
Akronf7ad89e2016-03-16 18:22:47 +0100810=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100811
812=back
813
814=item B<OpenNLP>
815
816=over 4
817
Akronf7ad89e2016-03-16 18:22:47 +0100818=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100819
Akronf7ad89e2016-03-16 18:22:47 +0100820=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100821
822=back
823
824=item B<Sgbr>
825
826=over 4
827
Akronf7ad89e2016-03-16 18:22:47 +0100828=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100829
Akronf7ad89e2016-03-16 18:22:47 +0100830=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100831
832=back
833
834=item B<TreeTagger>
835
836=over 4
837
Akronf7ad89e2016-03-16 18:22:47 +0100838=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100839
Akronf7ad89e2016-03-16 18:22:47 +0100840=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100841
842=back
843
844=item B<XIP>
845
846=over 4
847
Akronf7ad89e2016-03-16 18:22:47 +0100848=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100849
Akronf7ad89e2016-03-16 18:22:47 +0100850=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100851
Akronf7ad89e2016-03-16 18:22:47 +0100852=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100853
854=back
855
856=back
857
858More importers are in preparation.
859New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
860See the built-in annotation importers as examples.
861
Akron941c1a62016-02-23 17:41:41 +0100862=head1 AVAILABILITY
863
864 https://github.com/KorAP/KorAP-XML-Krill
865
866
867=head1 COPYRIGHT AND LICENSE
868
869Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100870
Akron941c1a62016-02-23 17:41:41 +0100871Author: L<Nils Diewald|http://nils-diewald.de/>
872
873L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
874Corpus Analysis Platform at the
875L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
876member of the
877L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
878
879This program is free software published under the
880L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
881
882=cut