blob: 6a2a5556b6f0b86a18244c23b7f4decb95bed14b [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron941c1a62016-02-23 17:41:41 +010076# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010077
# Release information shown by --help and --version
our $LAST_CHANGE = '2016/10/24';
our $LOCAL = $FindBin::Bin;
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command:
# The first argument is taken as the command (e.g. 'archive' or 'extract')
# unless it starts with a dash, in which case it is an option.
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +010090
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Scalar options (with their defaults, where one exists)
my $output;
my $overwrite;
my $meta;
my $token_base   = 'OpenNLP#tokens';
my $gzip;
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $primary;
my $pretty;
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,

  # Print the usage sections of the POD together with the version line
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version line (and the synopsis, as -verbose is 0)
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
129
# Parameters for pod2usage() when the script is called incorrectly:
# print the usage sections to STDOUT and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000143
# Initialize log4perl object:
# log everything at or above the requested level to a coloured STDERR appender
my %log_conf = (
  'log4perl.rootLogger'
    => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
153
# Lookup table of everything to skip (keys are lower-cased,
# either 'foundry', 'foundry#layer' or '#all')
my %skip = map { ( lc($_) => 1 ) } @skip;

# All known annotation layers as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested
# annotations (given as Foundry#Layer) are converted
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# i.e. neither the foundry nor the foundry#layer is in the skip table
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
225
# Get tokenization basis.
# The value has the form 'Foundry#Layer'; both parts stay undefined
# if no token base is set.
# (A declaration with a statement modifier - 'my ... if ...' - has
# undefined behaviour in Perl, so the condition is part of the
# expression instead.)
my ($token_base_foundry, $token_base_layer) =
  $token_base ? split(/#/, $token_base) : ();
228
# TODO: This should not be initialized for batch
# Shared cache, backed by the cache file given on the command line
my $cache = do {
  my @cache_param = (
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file  => $cache_init
  );
  Cache::FastMmap->new(@cache_param);
};

# Create batch object that converts single texts
# using the command line settings
my $batch_file = do {
  my @batch_param = (
    cache     => $cache,
    meta_type => $meta,
    overwrite => $overwrite,
    foundry   => $token_base_foundry,
    layer     => $token_base_layer,
    gzip      => $gzip,
    log       => $log,
    primary   => $primary,
    pretty    => $pretty,
    anno      => \@filtered_anno
  );
  KorAP::XML::Batch::File->new(@batch_param);
};
249
Akron941c1a62016-02-23 17:41:41 +0100250
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for the output file:
# a leading temporary directory fragment and the base path of the first
# input are removed, all slashes are turned into dashes, leading dashes
# are stripped, and - in case at least four dash separated parts
# remain - the first part is dropped.
#
# The ($) prototype is kept so existing calls are parsed as before.
sub get_file_name ($) {
  my $file = shift;

  # Base path of the primary input: for a directory without
  # trailing slash, the last path segment is cut off
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path (with an optional leading slash).
  # The path is quoted, so characters like '.', '+' or '(' in the
  # input path are matched literally instead of being interpreted
  # as regular expression syntax. As before, the match is not anchored.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
267
# Convert sigle to path construct:
# 'Corpus_Doc.Text' (optionally surrounded by whitespace)
# becomes 'Corpus/Doc/Text'; sigles of another form are left untouched
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
270
# Commands write into an output directory - in case one is given,
# it has to exist and has to be a directory
if ($cmd) {
  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";

    # This is an error, so signal failure to the caller
    exit(1);
  };
};
277
278
# Process a single file
unless ($cmd) {

  # Remember the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call
  # and since the start of the script
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));
    $log->info(
      'The code took: ' . $since_last . ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # The document is the first input - make sure
  # the path ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000309
# Extract XML files:
# Extracts texts (or whole documents) from the given archive(s)
# to the output directory and reports the result per sigle.
# Exits with 0 after processing and with 1 if unzip is not
# usable or the primary archive can't be opened.
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    # NOTE(review): This also attaches the primary archive again -
    # confirm that KorAP::XML::Archive handles this gracefully
    $archive->attach($_) foreach @input;

    # Paths in the archive are assumed to be prefixed with './'
    # unless the archive tells otherwise
    my $prefix = 1;

    # No sigles given - extract all texts of the archive
    unless (@sigle) {

      # Get files
      foreach my $path ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {
      my @text_sigle;

      my $prefix_check = 0;

      # Iterate over all sigle
      foreach my $sigle (@sigle) {

        # Sigle is a doc sigle (Corpus/Document without a text part)
        if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
          print "$sigle ";

          # Check if a prefix is needed (only once)
          unless ($prefix_check) {
            $prefix = $archive->check_prefix;
            $prefix_check = 1;
          };

          # TODO: Make this OS independent
          my $extracted = $archive->extract_doc(
            ($prefix ? './' : '') . $sigle, $output
          );
          print '' . ($extracted ? '' : 'not ');
          print "extracted.\n";
        }

        # Sigle is a text sigle - extract below
        else {
          push @text_sigle, $sigle;
        };
      };
      @sigle = @text_sigle;
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {
      print "$sigle ";

      # TODO: Make this OS independent
      my $extracted = $archive->extract_text(
        ($prefix ? './' : '') . $sigle, $output
      );
      print '' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # Extraction was processed - this is not an error
    exit(0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Signal the failure to the caller
    exit(1);
  };
}
397
# Process an archive:
# Converts all texts of a directory or of one or more zip archives
# and writes one file per text into the output directory.
# With --jobs > 0 the texts are converted in forked child processes.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The last argument is the data reference passed to finish():
      # [message, temporary directory (optional)]
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($text_dir => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Temporary directories are created with File::Temp below,
    # which is not loaded by this script otherwise
    require File::Temp;

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory the text is extracted to
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed along,
        # so it can be released on finish
        if ($batch_file->process($dir => $filename)) {
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # The start mark is only set when processing was started -
  # without it there is no duration to report
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}
553
# Unknown command:
# Warn about the command, print the usage information and exit
else {
  warn(
    "Unknown command '$cmd'.\n\n"
  );
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000559
560__END__
Akron941c1a62016-02-23 17:41:41 +0100561
562=pod
563
564=encoding utf8
565
566=head1 NAME
567
Akronf7ad89e2016-03-16 18:22:47 +0100568korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100569
570
571=head1 SYNOPSIS
572
Akrona76d8352016-10-27 16:27:32 +0200573 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100574
575=head1 DESCRIPTION
576
577L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
578compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100579The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100580
581
582=head1 INSTALLATION
583
584The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
585
Akronaf386982016-10-12 00:33:25 +0200586 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100587
Akronc13a1702016-03-15 19:33:14 +0100588In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100589be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200590Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100592
593=head1 ARGUMENTS
594
Akrona76d8352016-10-27 16:27:32 +0200595 $ korapxml2krill -z --input <directory> --output <filename>
596
597Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
598Expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200599
Akron941c1a62016-02-23 17:41:41 +0100600=over 2
601
602=item B<archive>
603
Akrona76d8352016-10-27 16:27:32 +0200604 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
605
606Converts an archive of KorAP-XML documents. Expects a directory
607(pointing to the text level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100608
609=item B<extract>
610
Akrona76d8352016-10-27 16:27:32 +0200611 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
612
613Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100614
615=back
616
617
618=head1 OPTIONS
619
620=over 2
621
Akrona76d8352016-10-27 16:27:32 +0200622=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100623
Akrona76d8352016-10-27 16:27:32 +0200624Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100625
Akron7606afa2016-10-25 16:23:49 +0200626Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akrona76d8352016-10-27 16:27:32 +0200627document, while C<archive> and C<extract> support zip files as well.
Akron7606afa2016-10-25 16:23:49 +0200628
Akrona76d8352016-10-27 16:27:32 +0200629C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200630that the first archive listed contains all primary data files
631and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200632
Akron7606afa2016-10-25 16:23:49 +0200633 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200634
Akron0c3e3752016-06-28 15:55:53 +0200635(The directory structure follows the base directory format,
636that may include a C<.> root folder.
637In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200638need to be passed with a hash sign in front of the archive's name.
639This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200640
Akron7606afa2016-10-25 16:23:49 +0200641To support zip files, a version of C<unzip> needs to be installed that is
642compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200643
Akron7606afa2016-10-25 16:23:49 +0200644B<The root folder switch using the hash sign is experimental and
645may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200646
Akron941c1a62016-02-23 17:41:41 +0100647=item B<--output|-o> <directory|file>
648
649Output folder for archive processing or
650document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100651writes to C<STDOUT> by default
652(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100653
654=item B<--overwrite|-w>
655
656Overwrite files that already exist.
657
658=item B<--token|-t> <foundry>[#<file>]
659
660Define the default tokenization by specifying
661the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100662of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100663
664=item B<--skip|-s> <foundry>[#<layer>]
665
Akronf7ad89e2016-03-16 18:22:47 +0100666Skip specific annotations by specifying the foundry
667(and optionally the layer with a C<#>-prefix),
668e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100669Can be set multiple times.
670
Akronc13a1702016-03-15 19:33:14 +0100671=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100672
Akronf7ad89e2016-03-16 18:22:47 +0100673Convert specific annotations by specifying the foundry
674(and optionally the layer with a C<#>-prefix),
675e.g. C<Mate> or C<Mate#Morpho>.
676Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100677
678=item B<--primary|-p>
679
Akronc13a1702016-03-15 19:33:14 +0100680Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100681Can be flagged using C<--no-primary> as well.
682This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100683
684=item B<--jobs|-j>
685
Define the number of concurrent jobs in separated forks
Akronf7ad89e2016-03-16 18:22:47 +0100687for archive processing.
Akron11c80302016-03-18 19:44:43 +0100688Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100689This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100690
Akron35db6e32016-03-17 22:42:22 +0100691=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100692
Akron35db6e32016-03-17 22:42:22 +0100693Define the metadata parser to use. Defaults to C<I5>.
694Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
695This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100696
697=item B<--pretty|-y>
698
Akronc13a1702016-03-15 19:33:14 +0100699Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100700This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100701
702=item B<--gzip|-z>
703
Akronf7ad89e2016-03-16 18:22:47 +0100704Compress the output.
705Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100706
Akron11c80302016-03-18 19:44:43 +0100707=item B<--cache|-c>
708
709File to mmap a cache (using L<Cache::FastMmap>).
710Defaults to C<korapxml2krill.cache> in the calling directory.
711
712=item B<--cache-size|-cs>
713
714Size of the cache. Defaults to C<50m>.
715
716=item B<--cache-init|-ci>
717
718Initialize cache file.
719Can be flagged using C<--no-cache-init> as well.
720Defaults to C<true>.
721
722=item B<--cache-delete|-cd>
723
724Delete cache file after processing.
725Can be flagged using C<--no-cache-delete> as well.
726Defaults to C<true>.
727
Akrone10ad322016-02-27 10:54:26 +0100728=item B<--sigle|-sg>
729
Akron20807582016-10-26 17:11:34 +0200730Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100731Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100732I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200733Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200734In case the C<Text> path is omitted, the whole document will be extracted.
Akrone10ad322016-02-27 10:54:26 +0100735
Akron941c1a62016-02-23 17:41:41 +0100736=item B<--log|-l>
737
738The L<Log4perl> log level, defaults to C<ERROR>.
739
740=item B<--help|-h>
741
742Print this document.
743
744=item B<--version|-v>
745
746Print version information.
747
748=back
749
Akronc13a1702016-03-15 19:33:14 +0100750=head1 ANNOTATION SUPPORT
751
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
753developed in the KorAP project that are part of the KorAP preprocessing pipeline.
754The base foundry with paragraphs, sentences, and the text element are mandatory for
755L<Krill|https://github.com/KorAP/Krill>.
756
Akronf7ad89e2016-03-16 18:22:47 +0100757=over 2
Akronc13a1702016-03-15 19:33:14 +0100758
759=item B<Base>
760
761=over 4
762
Akronf7ad89e2016-03-16 18:22:47 +0100763=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100764
Akronf7ad89e2016-03-16 18:22:47 +0100765=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100766
767=back
768
769=item B<Connexor>
770
771=over 4
772
Akronf7ad89e2016-03-16 18:22:47 +0100773=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100774
Akronf7ad89e2016-03-16 18:22:47 +0100775=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100776
Akronf7ad89e2016-03-16 18:22:47 +0100777=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100778
Akronf7ad89e2016-03-16 18:22:47 +0100779=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100780
781=back
782
783=item B<CoreNLP>
784
785=over 4
786
Akronf7ad89e2016-03-16 18:22:47 +0100787=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100788
Akronf7ad89e2016-03-16 18:22:47 +0100789=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100790
Akronf7ad89e2016-03-16 18:22:47 +0100791=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100794
795=back
796
797=item B<DeReKo>
798
799=over 4
800
Akronf7ad89e2016-03-16 18:22:47 +0100801=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100802
803=back
804
805=item B<Glemm>
806
807=over 4
808
Akronf7ad89e2016-03-16 18:22:47 +0100809=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100810
811=back
812
=item B<Malt>

=over 4

=item #Dependency

=back

=item B<MDParser>

=over 4

=item #Dependency

=back

=item B<Mate>
814
815=over 4
816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100818
Akronf7ad89e2016-03-16 18:22:47 +0100819=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100820
821=back
822
823=item B<OpenNLP>
824
825=over 4
826
Akronf7ad89e2016-03-16 18:22:47 +0100827=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100828
Akronf7ad89e2016-03-16 18:22:47 +0100829=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100830
831=back
832
833=item B<Sgbr>
834
835=over 4
836
Akronf7ad89e2016-03-16 18:22:47 +0100837=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100838
Akronf7ad89e2016-03-16 18:22:47 +0100839=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100840
841=back
842
843=item B<TreeTagger>
844
845=over 4
846
Akronf7ad89e2016-03-16 18:22:47 +0100847=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100848
Akronf7ad89e2016-03-16 18:22:47 +0100849=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100850
851=back
852
853=item B<XIP>
854
855=over 4
856
=item #Constituency

=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100858
Akronf7ad89e2016-03-16 18:22:47 +0100859=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100860
Akronf7ad89e2016-03-16 18:22:47 +0100861=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100862
863=back
864
865=back
866
867More importers are in preparation.
868New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
869See the built-in annotation importers as examples.
870
Akron941c1a62016-02-23 17:41:41 +0100871=head1 AVAILABILITY
872
873 https://github.com/KorAP/KorAP-XML-Krill
874
875
876=head1 COPYRIGHT AND LICENSE
877
878Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100879
Akron941c1a62016-02-23 17:41:41 +0100880Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200881Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100882
883L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
884Corpus Analysis Platform at the
885L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
886member of the
887L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
888
889This program is free software published under the
890L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
891
892=cut