blob: 2713342a38ccd71e2f02fd086a61220634765309 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akron941c1a62016-02-23 17:41:41 +010073# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010074
Nils Diewald0e489772016-10-24 15:16:52 +020075our $LAST_CHANGE = '2016/10/24';
Akron941c1a62016-02-23 17:41:41 +010076our $LOCAL = $FindBin::Bin;
77our $VERSION_MSG = <<"VERSION";
78Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
79VERSION
80
# Parse command: The first argument is taken as the command,
# unless it starts with a dash (and therefore is an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

my (@skip, @sigle, @anno, @input);
my $text;

# Targets of the command line options, including their defaults
my $output;
my $overwrite;
my $meta;
my $token_base   = 'OpenNLP#tokens';
my $gzip;
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $primary;
my $pretty;
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

# Print the usage sections of the documentation
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version message
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'help|h'           => $show_help,
  'version|v'        => $show_version
);
126
# Usage parameters for erroneous calls:
# Print the usage sections and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Initialize log4perl object:
# All messages of the requested level are written to STDERR
my %log_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
150
# Lookup table of all values passed to --skip (case insensitive)
my %skip = map { ( lc($_) => 1 ) } @skip;

# All supported annotations as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
203
# Check filters
my @filtered_anno;

# In case everything is skipped ('--skip #all'),
# only the annotations passed using --anno are converted
if ($skip{'#all'}) {
  foreach (@anno) {
    push @filtered_anno, [ split('#', $_) ];
  };
}

# Add all annotations that are not skipped
else {
  # Add to index file - respect skipping
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};

# Get tokenization basis.
# The declaration is separated from the conditional assignment,
# as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
225
# TODO: This should not be initialized for batch
# Parameters of the cache, mapped to the cache file
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object, that is used to process
# each single text with the given settings
my @batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
246
Akron941c1a62016-02-23 17:41:41 +0100247
# Get file name based on path information.
#
# Expects the path of a text and returns a flat file name:
# Temporary directory fragments and the base path of the
# first input are removed and all remaining slashes are
# replaced by dashes.
sub get_file_name ($) {

  # Base path, derived from the first input
  my $i = $input[0];
  if (-d $i) {

    # Remove the last path segment of the directory
    # (has no effect, if the path ends with a slash)
    $i =~ s![^\/]+$!!;
  };
  my $file = shift;

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The path is quoted, so characters like
  # '.', '+' or '(' are matched literally instead of being treated
  # as regex syntax. As before, the match is not anchored
  # (the former '^?' made the anchor optional).
  $file =~ s{\/?\Q$i\E}{};

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
263
# Convert sigle to path construct,
# i.e. 'Corpus_Doc.Text' becomes 'Corpus/Doc/Text'
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Commands expect the output to be an existing directory
if ($cmd) {

  # '-d' is false for non-existing paths as well
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";

    # This is an error - signal it with a non-zero exit status
    exit(1);
  };
};
273
274
# Process a single file
if (!$cmd) {
  my $input = $input[0];

  # Remember the start time as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and the time passed overall
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # make sure the path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000305
# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    # By default the texts are expected below a './' root folder
    my $prefix = 1;

    # No sigles given - extract all texts of the archive
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ";

      # TODO: Make this OS independent
      print '' . (
        $archive->extract(
          ($prefix ? './' : '') . $_, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    print "\n";

    # The extraction ran through - this is not an error
    exit(0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Signal the failure to the caller
    exit(1);
  };
}
359
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message.
  # The callback receives the pid and the exit code of the job;
  # the last parameter is the data structure passed to finish(),
  # which is an array of the message and (for archives)
  # the temporary directory.
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory, if given
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;    # Start time of the processing
  my $temp; # Temporary directory for extracted texts
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect the directories of all texts,
    # which are the directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      # (the parent process continues with the next text)
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork
      # (the parent process continues with the next text)
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      # NOTE(review): File::Temp is not loaded by this script itself -
      # presumably one of the KorAP modules loads it; confirm.
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if ($batch_file->process($dir => $filename)) {
          # Delete temporary file
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          # Delete temporary file
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  # Input can't be processed
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Nothing was processed and the start time was never taken,
    # so the report below can't be created - clean up and
    # signal the failure instead of announcing "Done."

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000521
522__END__
Akron941c1a62016-02-23 17:41:41 +0100523
524=pod
525
526=encoding utf8
527
528=head1 NAME
529
Akronf7ad89e2016-03-16 18:22:47 +0100530korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100531
532
533=head1 SYNOPSIS
534
Akronc13a1702016-03-15 19:33:14 +0100535 $ korapxml2krill -z --input <directory> --output <filename>
Akron7606afa2016-10-25 16:23:49 +0200536 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100538
539
540=head1 DESCRIPTION
541
542L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
543compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100544The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100545
546
547=head1 INSTALLATION
548
549The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
550
Akronaf386982016-10-12 00:33:25 +0200551 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100552
Akronc13a1702016-03-15 19:33:14 +0100553In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100554be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200555Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100557
558=head1 ARGUMENTS
559
Akron7606afa2016-10-25 16:23:49 +0200560Without arguments, C<korapxml2krill> processes a directory of a single KorAP-XML document.
561
Akron941c1a62016-02-23 17:41:41 +0100562=over 2
563
564=item B<archive>
565
Akron7606afa2016-10-25 16:23:49 +0200566Processes an archive as a Zip-file or a folder of KorAP-XML documents.
Akrone10ad322016-02-27 10:54:26 +0100567
568=item B<extract>
569
Akron7606afa2016-10-25 16:23:49 +0200570Extracts KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100571
572=back
573
574
575=head1 OPTIONS
576
577=over 2
578
Akron2cfe8092016-06-24 17:48:49 +0200579=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100580
Akronf7ad89e2016-03-16 18:22:47 +0100581Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100582
Akron7606afa2016-10-25 16:23:49 +0200583Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
584document, while C<archive> and C<extract> support zip archives as well.
585
586C<archive> supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200587that the first archive listed contains all primary data files
588and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200589
Akron7606afa2016-10-25 16:23:49 +0200590 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200591
Akron0c3e3752016-06-28 15:55:53 +0200592(The directory structure follows the base directory format,
593that may include a C<.> root folder.
594In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200595need to be passed with a hash sign in front of the archive's name.
596This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200597
Akron7606afa2016-10-25 16:23:49 +0200598To support zip files, a version of C<unzip> needs to be installed that is
599compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200600
Akron7606afa2016-10-25 16:23:49 +0200601B<The root folder switch using the hash sign is experimental and
602may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200603
Akron941c1a62016-02-23 17:41:41 +0100604=item B<--output|-o> <directory|file>
605
606Output folder for archive processing or
607document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100608writes to C<STDOUT> by default
609(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100610
611=item B<--overwrite|-w>
612
613Overwrite files that already exist.
614
615=item B<--token|-t> <foundry>[#<file>]
616
617Define the default tokenization by specifying
618the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100619of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100620
621=item B<--skip|-s> <foundry>[#<layer>]
622
Akronf7ad89e2016-03-16 18:22:47 +0100623Skip specific annotations by specifying the foundry
624(and optionally the layer with a C<#>-prefix),
625e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100626Can be set multiple times.
627
Akronc13a1702016-03-15 19:33:14 +0100628=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100629
Akronf7ad89e2016-03-16 18:22:47 +0100630Convert specific annotations by specifying the foundry
631(and optionally the layer with a C<#>-prefix),
632e.g. C<Mate> or C<Mate#Morpho>.
633Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100634
635=item B<--primary|-p>
636
Akronc13a1702016-03-15 19:33:14 +0100637Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100638Can be flagged using C<--no-primary> as well.
639This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100640
641=item B<--jobs|-j>
642
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100644for archive processing.
Akron11c80302016-03-18 19:44:43 +0100645Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100646This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100647
Akron35db6e32016-03-17 22:42:22 +0100648=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100649
Akron35db6e32016-03-17 22:42:22 +0100650Define the metadata parser to use. Defaults to C<I5>.
651Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
652This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100653
654=item B<--pretty|-y>
655
Akronc13a1702016-03-15 19:33:14 +0100656Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100657This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100658
659=item B<--gzip|-z>
660
Akronf7ad89e2016-03-16 18:22:47 +0100661Compress the output.
662Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100663
Akron11c80302016-03-18 19:44:43 +0100664=item B<--cache|-c>
665
666File to mmap a cache (using L<Cache::FastMmap>).
667Defaults to C<korapxml2krill.cache> in the calling directory.
668
669=item B<--cache-size|-cs>
670
671Size of the cache. Defaults to C<50m>.
672
673=item B<--cache-init|-ci>
674
675Initialize cache file.
676Can be flagged using C<--no-cache-init> as well.
677Defaults to C<true>.
678
679=item B<--cache-delete|-cd>
680
681Delete cache file after processing.
682Can be flagged using C<--no-cache-delete> as well.
683Defaults to C<true>.
684
Akrone10ad322016-02-27 10:54:26 +0100685=item B<--sigle|-sg>
686
687Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100688Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100689I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200690Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100691
Akron941c1a62016-02-23 17:41:41 +0100692=item B<--log|-l>
693
694The L<Log4perl> log level, defaults to C<ERROR>.
695
696=item B<--help|-h>
697
698Print this document.
699
700=item B<--version|-v>
701
702Print version information.
703
704=back
705
Akronc13a1702016-03-15 19:33:14 +0100706=head1 ANNOTATION SUPPORT
707
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
709developed in the KorAP project that are part of the KorAP preprocessing pipeline.
710The base foundry with paragraphs, sentences, and the text element are mandatory for
711L<Krill|https://github.com/KorAP/Krill>.
712
Akronf7ad89e2016-03-16 18:22:47 +0100713=over 2
Akronc13a1702016-03-15 19:33:14 +0100714
715=item B<Base>
716
717=over 4
718
Akronf7ad89e2016-03-16 18:22:47 +0100719=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100720
Akronf7ad89e2016-03-16 18:22:47 +0100721=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100722
723=back
724
725=item B<Connexor>
726
727=over 4
728
Akronf7ad89e2016-03-16 18:22:47 +0100729=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100730
Akronf7ad89e2016-03-16 18:22:47 +0100731=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100732
Akronf7ad89e2016-03-16 18:22:47 +0100733=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100734
Akronf7ad89e2016-03-16 18:22:47 +0100735=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100736
737=back
738
739=item B<CoreNLP>
740
741=over 4
742
Akronf7ad89e2016-03-16 18:22:47 +0100743=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100744
Akronf7ad89e2016-03-16 18:22:47 +0100745=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100746
Akronf7ad89e2016-03-16 18:22:47 +0100747=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100748
Akronf7ad89e2016-03-16 18:22:47 +0100749=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100750
751=back
752
753=item B<DeReKo>
754
755=over 4
756
Akronf7ad89e2016-03-16 18:22:47 +0100757=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100758
759=back
760
761=item B<Glemm>
762
763=over 4
764
Akronf7ad89e2016-03-16 18:22:47 +0100765=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100766
767=back
768
769=item B<Mate>
770
771=over 4
772
Akronf7ad89e2016-03-16 18:22:47 +0100773=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100774
Akronf7ad89e2016-03-16 18:22:47 +0100775=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100776
777=back
778
779=item B<OpenNLP>
780
781=over 4
782
Akronf7ad89e2016-03-16 18:22:47 +0100783=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100784
Akronf7ad89e2016-03-16 18:22:47 +0100785=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100786
787=back
788
789=item B<Sgbr>
790
791=over 4
792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100796
797=back
798
799=item B<TreeTagger>
800
801=over 4
802
Akronf7ad89e2016-03-16 18:22:47 +0100803=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100804
Akronf7ad89e2016-03-16 18:22:47 +0100805=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100806
807=back
808
809=item B<XIP>
810
811=over 4
812
Akronf7ad89e2016-03-16 18:22:47 +0100813=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100814
Akronf7ad89e2016-03-16 18:22:47 +0100815=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100818
819=back
820
821=back
822
823More importers are in preparation.
824New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
825See the built-in annotation importers as examples.
826
Akron941c1a62016-02-23 17:41:41 +0100827=head1 AVAILABILITY
828
829 https://github.com/KorAP/KorAP-XML-Krill
830
831
832=head1 COPYRIGHT AND LICENSE
833
834Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100835
Akron941c1a62016-02-23 17:41:41 +0100836Author: L<Nils Diewald|http://nils-diewald.de/>
837
838L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
839Corpus Analysis Platform at the
840L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
841member of the
842L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
843
844This program is free software published under the
845L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
846
847=cut