blob: 939dcd4fe0ac5ef7ada6fc981b6dc3b359642806 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
# Release information shown by --help and --version
our $LAST_CHANGE = '2016/08/16';
our $LOCAL       = $FindBin::Bin;
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command: a leading argument that does not start
# with a dash is taken off the argument list as the subcommand
my $cmd;
if (@ARGV) {
  my $first = $ARGV[0];
  $cmd = shift @ARGV if $first && index($first, '-') != 0;
};
Akron93d620e2016-02-05 19:40:05 +010080
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $token_base, $gzip, $primary, $pretty);

# Options with a default value - the command line may override them
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

# Print the main documentation sections together with the version message
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version message only
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'help|h'           => $print_help,
  'version|v'        => $print_version
);
119
# Usage settings shared by all error exits: print the main
# documentation sections with the version message and exit with 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# At least one input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000133
# Configure log4perl: the root logger uses the requested level
# (upper-cased) and a single colored screen appender named STDERR
my %log_conf = (
  'log4perl.rootLogger'             => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'        => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger used by the script and passed on to the batch processor
my $log = Log::Log4perl->get_logger('main');
143
# Lookup table of everything passed to --skip (case insensitive)
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotations as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations
# explicitly requested with --anno are converted
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  LAYER:
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next LAYER if $skip{$foundry};
    next LAYER if $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};
215
# Get tokenization basis (given as "Foundry#Layer").
# The variables are declared unconditionally, as a "my" declaration
# with a statement modifier has no defined behaviour in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Shared cache, mapped to the cache file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that converts a single text
# based on the command line settings
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
239
Akron941c1a62016-02-23 17:41:41 +0100240
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it:
# A leading temporary directory ("/tmp/<name>") and the first
# occurrence of the first input path are removed, all remaining
# slashes are turned into dashes and leading dashes are stripped.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove temporary directory prefix
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path (with an optional leading slash).
  # The path is quoted, so characters like '.', '+' or '(' are
  # matched literally instead of being treated as regex syntax.
  # As before, the match is not anchored to the start of the path.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
251
Akron941c1a62016-02-23 17:41:41 +0100252
253# Write file
Akrone1dbc382016-07-08 22:24:52 +0200254#sub write_file {
255# my $anno = shift;
256# my $file = get_file_name $anno;
257#
258# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
259#
260# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
261# $call .= ' -i ' . $anno;
262# $call .= ' -o ' . $output . '/' . $file . '.json';
263# $call .= '.gz -z' if $gzip;
264# $call .= ' -m ' . $meta if $meta;
265# $call .= ' -w' if $overwrite;
266# $call .= ' -t ' . $token_base if $token_base;
267# $call .= ' -l ' . $log_level if $log_level;
268# $call .= ' -c ' . $cache_file;
269# $call .= ' -cs ' . $cache_size;
270# $call .= ' --no-cache-delete'; # Don't delete the cache
271# $call .= ' --no-cache-init'; # Don't initialize the cache
272# $call .= ' --no-primary ' if $primary;
273# $call .= ' -y ' . $pretty if $pretty;
274# $call .= ' -a ' . $_ foreach @anno;
275# $call .= ' -s ' . $_ foreach @skip;
276# system($call);
277# return "$file";
278#};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000279
Nils Diewald2db9ad02013-10-29 19:26:43 +0000280
# Convert sigle to path construct,
# e.g. "Corpus_Doc.Text" becomes "Corpus/Doc/Text"
# (surrounding whitespace is removed, the array is modified in place)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
283
# All commands write to an output directory.
# A given output that is not an existing directory is an error,
# so the script stops with a non-zero exit status.
if ($cmd) {
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };
};
290
291
# Process a single file
if (!$cmd) {

  # Take the reference times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and since the start of the program
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # The first input is the text to convert -
  # make sure the path ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000322
# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object - the first input is the primary archive
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f $input[0];

  # Can't create archive object - this is a failure
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives
  # (all inputs are passed, including the primary archive)
  $archive->attach($_) foreach @input;

  # Paths are expected to have a root prefix,
  # unless the archive listing says otherwise
  my $prefix = 1;

  # No sigles given - extract all texts of the archive
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  };

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {
    print "$sigle ";

    # TODO: Make this OS independent
    my $path = ($prefix ? './' : '') . $sigle;
    print '' . ($archive->extract($path, $output) ? '' : 'not ');
    print "extracted.\n";
  };

  print "\n";

  # The extraction run is complete - this is not a failure
  exit(0);
}
376
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The last argument is the reference passed to finish()
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $$data\n";
    }
  );

  my $t;    # Start time of processing
  my $temp; # Temporary directory for extracted texts
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;
      my $msg = $batch_file->process($text_dir => $filename);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    # (all inputs are passed, including the primary archive)
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    # Create temporary directory.
    # File::Temp is not loaded at the top of this script,
    # so make sure it is available here.
    require File::Temp;
    $temp = File::Temp->newdir;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if ($batch_file->process($dir => $filename)) {
          $pool->finish(0, \("Processed " . $filename));
        }
        else {
          $pool->finish(1, \("Unable to process " . $dir));
        };
      }

      # Unable to extract
      else {

        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete temporary file
  $temp = undef;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # No start time is taken, if the input was neither a
  # directory nor an archive - there is nothing to report then
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000534
535__END__
Akron941c1a62016-02-23 17:41:41 +0100536
537=pod
538
539=encoding utf8
540
541=head1 NAME
542
Akronf7ad89e2016-03-16 18:22:47 +0100543korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100544
545
546=head1 SYNOPSIS
547
  $ korapxml2krill -z --input <directory> --output <filename>
  $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100551
552
553=head1 DESCRIPTION
554
555L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
556compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100557The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100558
559
560=head1 INSTALLATION
561
562The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
563
564 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
565
Akronc13a1702016-03-15 19:33:14 +0100566In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100567be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100568
569
570=head1 ARGUMENTS
571
572=over 2
573
574=item B<archive>
575
Akrone10ad322016-02-27 10:54:26 +0100576Process an archive as a Zip-file or a folder of KorAP-XML documents.
577
578=item B<extract>
579
580Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100581
582=back
583
584
585=head1 OPTIONS
586
587=over 2
588
Akron2cfe8092016-06-24 17:48:49 +0200589=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100590
Akronf7ad89e2016-03-16 18:22:47 +0100591Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100592
Akron0c3e3752016-06-28 15:55:53 +0200593Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200594that the first archive listed contains all primary data files
595and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200596
597 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
598
Akron0c3e3752016-06-28 15:55:53 +0200599(The directory structure follows the base directory format,
600that may include a C<.> root folder.
601In this case further archives lacking a C<.> root folder
602need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200603
Akron651cb8d2016-08-16 21:44:49 +0200604B<The root folder switch is experimental and may vanish in future versions.>
605
Akron941c1a62016-02-23 17:41:41 +0100606=item B<--output|-o> <directory|file>
607
608Output folder for archive processing or
609document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100610writes to C<STDOUT> by default
611(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100612
613=item B<--overwrite|-w>
614
615Overwrite files that already exist.
616
617=item B<--token|-t> <foundry>[#<file>]
618
619Define the default tokenization by specifying
620the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100621of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100622
623=item B<--skip|-s> <foundry>[#<layer>]
624
Akronf7ad89e2016-03-16 18:22:47 +0100625Skip specific annotations by specifying the foundry
626(and optionally the layer with a C<#>-prefix),
627e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100628Can be set multiple times.
629
Akronc13a1702016-03-15 19:33:14 +0100630=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100631
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
Only takes effect in combination with C<--skip #ALL>;
otherwise all supported annotations that are not skipped are converted.
Akron941c1a62016-02-23 17:41:41 +0100636
637=item B<--primary|-p>
638
Akronc13a1702016-03-15 19:33:14 +0100639Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100640Can be flagged using C<--no-primary> as well.
641This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100642
643=item B<--jobs|-j>
644
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100646for archive processing.
Akron11c80302016-03-18 19:44:43 +0100647Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100648This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100649
Akron35db6e32016-03-17 22:42:22 +0100650=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100651
Akron35db6e32016-03-17 22:42:22 +0100652Define the metadata parser to use. Defaults to C<I5>.
653Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
654This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100655
656=item B<--pretty|-y>
657
Akronc13a1702016-03-15 19:33:14 +0100658Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100659This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100660
661=item B<--gzip|-z>
662
Akronf7ad89e2016-03-16 18:22:47 +0100663Compress the output.
664Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100665
Akron11c80302016-03-18 19:44:43 +0100666=item B<--cache|-c>
667
668File to mmap a cache (using L<Cache::FastMmap>).
669Defaults to C<korapxml2krill.cache> in the calling directory.
670
671=item B<--cache-size|-cs>
672
673Size of the cache. Defaults to C<50m>.
674
675=item B<--cache-init|-ci>
676
677Initialize cache file.
678Can be flagged using C<--no-cache-init> as well.
679Defaults to C<true>.
680
681=item B<--cache-delete|-cd>
682
683Delete cache file after processing.
684Can be flagged using C<--no-cache-delete> as well.
685Defaults to C<true>.
686
Akrone10ad322016-02-27 10:54:26 +0100687=item B<--sigle|-sg>
688
689Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100690Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100691I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200692Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100693
Akron941c1a62016-02-23 17:41:41 +0100694=item B<--log|-l>
695
696The L<Log4perl> log level, defaults to C<ERROR>.
697
698=item B<--help|-h>
699
700Print this document.
701
702=item B<--version|-v>
703
704Print version information.
705
706=back
707
Akronc13a1702016-03-15 19:33:14 +0100708=head1 ANNOTATION SUPPORT
709
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
714
Akronf7ad89e2016-03-16 18:22:47 +0100715=over 2
Akronc13a1702016-03-15 19:33:14 +0100716
717=item B<Base>
718
719=over 4
720
Akronf7ad89e2016-03-16 18:22:47 +0100721=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100722
Akronf7ad89e2016-03-16 18:22:47 +0100723=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100724
725=back
726
727=item B<Connexor>
728
729=over 4
730
Akronf7ad89e2016-03-16 18:22:47 +0100731=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100732
Akronf7ad89e2016-03-16 18:22:47 +0100733=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100734
Akronf7ad89e2016-03-16 18:22:47 +0100735=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100736
Akronf7ad89e2016-03-16 18:22:47 +0100737=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100738
739=back
740
741=item B<CoreNLP>
742
743=over 4
744
Akronf7ad89e2016-03-16 18:22:47 +0100745=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100746
Akronf7ad89e2016-03-16 18:22:47 +0100747=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100748
Akronf7ad89e2016-03-16 18:22:47 +0100749=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100750
Akronf7ad89e2016-03-16 18:22:47 +0100751=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100752
753=back
754
755=item B<DeReKo>
756
757=over 4
758
Akronf7ad89e2016-03-16 18:22:47 +0100759=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100760
761=back
762
763=item B<Glemm>
764
765=over 4
766
Akronf7ad89e2016-03-16 18:22:47 +0100767=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100768
769=back
770
771=item B<Mate>
772
773=over 4
774
Akronf7ad89e2016-03-16 18:22:47 +0100775=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100776
Akronf7ad89e2016-03-16 18:22:47 +0100777=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100778
779=back
780
781=item B<OpenNLP>
782
783=over 4
784
Akronf7ad89e2016-03-16 18:22:47 +0100785=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100786
Akronf7ad89e2016-03-16 18:22:47 +0100787=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100788
789=back
790
791=item B<Sgbr>
792
793=over 4
794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100796
Akronf7ad89e2016-03-16 18:22:47 +0100797=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100798
799=back
800
801=item B<TreeTagger>
802
803=over 4
804
Akronf7ad89e2016-03-16 18:22:47 +0100805=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100806
Akronf7ad89e2016-03-16 18:22:47 +0100807=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100808
809=back
810
811=item B<XIP>
812
813=over 4
814
Akronf7ad89e2016-03-16 18:22:47 +0100815=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100818
Akronf7ad89e2016-03-16 18:22:47 +0100819=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100820
821=back
822
823=back
824
825More importers are in preparation.
826New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
827See the built-in annotation importers as examples.
828
Akron941c1a62016-02-23 17:41:41 +0100829=head1 AVAILABILITY
830
831 https://github.com/KorAP/KorAP-XML-Krill
832
833
834=head1 COPYRIGHT AND LICENSE
835
836Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100837
Akron941c1a62016-02-23 17:41:41 +0100838Author: L<Nils Diewald|http://nils-diewald.de/>
839
840L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
841Corpus Analysis Platform at the
842L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
843member of the
844L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
845
846This program is free software published under the
847L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
848
849=cut