blob: a66565eadf0c7a98f92ee47c605bddcdaf481596 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
Akron5f51d422016-08-16 16:26:43 +020068our $LAST_CHANGE = '2016/08/16';
Akron941c1a62016-02-23 17:41:41 +010069our $LOCAL = $FindBin::Bin;
70our $VERSION_MSG = <<"VERSION";
71Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
72VERSION
73
# Parse command:
# The first argument is treated as the command,
# unless it starts with a dash (and therefore is an option)
our @ARGV;
my $cmd;
if ($ARGV[0]) {
  $cmd = shift @ARGV unless index($ARGV[0], '-') == 0;
};
Akron93d620e2016-02-05 19:40:05 +010080
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with a default value
my $token_base   = 'OpenNLP#tokens';
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

# Handler for --help: print version message and usage sections
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: print version message
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  )
};

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'help|h'           => $print_help,
  'version|v'        => $print_version
);
119
# Parameters passed to pod2usage on erroneous calls:
# print the version message and the usage sections, then exit with status 1
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000133
# Initialize log4perl object:
# The root logger uses the requested level and writes
# to a colored screen appender called STDERR
my %log_conf = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
143
# Index of everything to skip, keyed by the lowercased
# value passed to --skip ('foundry', 'foundry#layer' or '#all')
my %skip = map { ( lc($_) => 1 ) } @skip;

# All annotations known to the script as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations passed via --anno are taken
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Keep a pair unless its Foundry or its Foundry#Layer should be skipped
  @filtered_anno = grep {
    my ($foundry, $layer) = map { lc($_) } @$_;
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
215
# Get tokenization basis as a foundry and a layer part.
# Declaration and conditional assignment are kept apart, because
# the behaviour of "my ... if ..." is undefined (see perlsyn).
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};
218
# Set up the memory mapped cache
# TODO: This should not be initialized for batch
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object based on the command line options
my @batch_param = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
239
Akron941c1a62016-02-23 17:41:41 +0100240
# Get file name based on path information.
#
# Expects the path of a text and returns a flat file name:
# A leading temporary directory fragment and the base path of the
# first input are stripped, all remaining slashes are turned into
# dashes and leading dashes are removed.
sub get_file_name ($) {
  my $file = shift;

  # The base path is the first input - in case this is a directory,
  # the last path segment is removed
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path - it is matched literally (\Q..\E),
  # so regex meta characters in the path (like '.' or '+')
  # can neither match arbitrarily nor break the pattern
  $file =~ s/^\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
256
# Convert sigle to path construct
# (Corpus_Document.Text becomes Corpus/Document/Text, in place)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
259
# If a command is given, a passed output has to be an existing directory
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is an error - signal it to the calling process
  exit(1);
};
266
267
# Process a single file
unless ($cmd) {

  # Take the start time as early as possible
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Take the first input and make sure the path ends with a slash
  (my $document = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($document, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
}
Nils Diewald59094f22014-11-05 18:20:50 +0000298
# Extract XML files:
# Extracts the texts given by --sigle (or all texts, if no sigle
# is passed) from the input archives to the output.
# Exits with 0 after the extraction ran and with 1
# if the primary archive or unzip is not usable.
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    my $prefix = 1;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach my $path ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {
      print "$sigle ";

      # TODO: Make this OS independent
      print '' . (
        $archive->extract(
          ($prefix ? './' : '') . $sigle, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    print "\n";

    # The extraction ran through - this is not an error
    exit(0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Signal the failure to the calling process
    exit(1);
  };
}
352
# Process an archive:
# Converts all texts of an input directory or of the input archives
# and writes one file per text to the output directory.
# The conversion may be spread over multiple forks (see --jobs).
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The message is passed as a scalar reference in the last
      # argument - it may be missing, e.g. if the child terminated
      # without calling finish()
      my $data = pop;
      my $msg  = ref $data ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      # (the parent continues with the next text)
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($text_dir => $filename)) {
        $pool->finish(0, \("Processed " . $filename));
      }
      else {
        $pool->finish(1, \("Unable to process " . $text_dir));
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    # Create temporary directory
    # (File::Temp is not loaded by this script on its own)
    require File::Temp;
    $temp = File::Temp->newdir;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      # (the parent continues with the next text)
      $pool->start and next ARCHIVE_LOOP;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if ($batch_file->process($dir => $filename)) {
          $pool->finish(0, \("Processed " . $filename));
        }
        else {
          $pool->finish(1, \("Unable to process " . $dir));
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, \("Unable to extract " . $text_path));
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Nothing was started, so there is no timing information to report -
    # clean up and signal the failure to the calling process
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete temporary directory
  $temp = undef;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000511
512__END__
Akron941c1a62016-02-23 17:41:41 +0100513
514=pod
515
516=encoding utf8
517
518=head1 NAME
519
Akronf7ad89e2016-03-16 18:22:47 +0100520korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100521
522
523=head1 SYNOPSIS
524
Akronc13a1702016-03-15 19:33:14 +0100525 $ korapxml2krill -z --input <directory> --output <filename>
526 $ korapxml2krill archive -z --input <directory> --output <directory>
527 $ korapxml2krill extract --input <directory> --output <filename> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100528
529
530=head1 DESCRIPTION
531
532L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
533compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100534The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100535
536
537=head1 INSTALLATION
538
539The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
540
Akronaf386982016-10-12 00:33:25 +0200541 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100542
Akronc13a1702016-03-15 19:33:14 +0100543In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100544be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100545
546
547=head1 ARGUMENTS
548
549=over 2
550
551=item B<archive>
552
Akrone10ad322016-02-27 10:54:26 +0100553Process an archive as a Zip-file or a folder of KorAP-XML documents.
554
555=item B<extract>
556
557Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100558
559=back
560
561
562=head1 OPTIONS
563
564=over 2
565
Akron2cfe8092016-06-24 17:48:49 +0200566=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100567
Akronf7ad89e2016-03-16 18:22:47 +0100568Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100569
Archiving supports multiple input archives, with the constraint
that the first archive listed contains all primary data files
and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200573
574 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
575
Akron0c3e3752016-06-28 15:55:53 +0200576(The directory structure follows the base directory format,
577that may include a C<.> root folder.
578In this case further archives lacking a C<.> root folder
579need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200580
Akron651cb8d2016-08-16 21:44:49 +0200581B<The root folder switch is experimental and may vanish in future versions.>
582
Akron941c1a62016-02-23 17:41:41 +0100583=item B<--output|-o> <directory|file>
584
585Output folder for archive processing or
586document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100587writes to C<STDOUT> by default
588(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100589
590=item B<--overwrite|-w>
591
592Overwrite files that already exist.
593
594=item B<--token|-t> <foundry>[#<file>]
595
596Define the default tokenization by specifying
597the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100598of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100599
600=item B<--skip|-s> <foundry>[#<layer>]
601
Akronf7ad89e2016-03-16 18:22:47 +0100602Skip specific annotations by specifying the foundry
603(and optionally the layer with a C<#>-prefix),
604e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100605Can be set multiple times.
606
Akronc13a1702016-03-15 19:33:14 +0100607=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100608
Akronf7ad89e2016-03-16 18:22:47 +0100609Convert specific annotations by specifying the foundry
610(and optionally the layer with a C<#>-prefix),
611e.g. C<Mate> or C<Mate#Morpho>.
612Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100613
614=item B<--primary|-p>
615
Akronc13a1702016-03-15 19:33:14 +0100616Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100617Can be flagged using C<--no-primary> as well.
618This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100619
620=item B<--jobs|-j>
621
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100623for archive processing.
Akron11c80302016-03-18 19:44:43 +0100624Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100625This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100626
Akron35db6e32016-03-17 22:42:22 +0100627=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100628
Akron35db6e32016-03-17 22:42:22 +0100629Define the metadata parser to use. Defaults to C<I5>.
630Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
631This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100632
633=item B<--pretty|-y>
634
Akronc13a1702016-03-15 19:33:14 +0100635Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100636This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100637
638=item B<--gzip|-z>
639
Akronf7ad89e2016-03-16 18:22:47 +0100640Compress the output.
641Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100642
Akron11c80302016-03-18 19:44:43 +0100643=item B<--cache|-c>
644
645File to mmap a cache (using L<Cache::FastMmap>).
646Defaults to C<korapxml2krill.cache> in the calling directory.
647
648=item B<--cache-size|-cs>
649
650Size of the cache. Defaults to C<50m>.
651
652=item B<--cache-init|-ci>
653
654Initialize cache file.
655Can be flagged using C<--no-cache-init> as well.
656Defaults to C<true>.
657
658=item B<--cache-delete|-cd>
659
660Delete cache file after processing.
661Can be flagged using C<--no-cache-delete> as well.
662Defaults to C<true>.
663
Akrone10ad322016-02-27 10:54:26 +0100664=item B<--sigle|-sg>
665
666Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100667Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100668I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200669Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100670
Akron941c1a62016-02-23 17:41:41 +0100671=item B<--log|-l>
672
673The L<Log4perl> log level, defaults to C<ERROR>.
674
675=item B<--help|-h>
676
677Print this document.
678
679=item B<--version|-v>
680
681Print version information.
682
683=back
684
Akronc13a1702016-03-15 19:33:14 +0100685=head1 ANNOTATION SUPPORT
686
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
691
Akronf7ad89e2016-03-16 18:22:47 +0100692=over 2
Akronc13a1702016-03-15 19:33:14 +0100693
694=item B<Base>
695
696=over 4
697
Akronf7ad89e2016-03-16 18:22:47 +0100698=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100699
Akronf7ad89e2016-03-16 18:22:47 +0100700=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100701
702=back
703
704=item B<Connexor>
705
706=over 4
707
Akronf7ad89e2016-03-16 18:22:47 +0100708=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100709
Akronf7ad89e2016-03-16 18:22:47 +0100710=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100711
Akronf7ad89e2016-03-16 18:22:47 +0100712=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100713
Akronf7ad89e2016-03-16 18:22:47 +0100714=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100715
716=back
717
718=item B<CoreNLP>
719
720=over 4
721
Akronf7ad89e2016-03-16 18:22:47 +0100722=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100723
Akronf7ad89e2016-03-16 18:22:47 +0100724=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100725
Akronf7ad89e2016-03-16 18:22:47 +0100726=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100727
Akronf7ad89e2016-03-16 18:22:47 +0100728=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100729
730=back
731
732=item B<DeReKo>
733
734=over 4
735
Akronf7ad89e2016-03-16 18:22:47 +0100736=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100737
738=back
739
740=item B<Glemm>
741
742=over 4
743
Akronf7ad89e2016-03-16 18:22:47 +0100744=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100745
746=back
747
748=item B<Mate>
749
750=over 4
751
Akronf7ad89e2016-03-16 18:22:47 +0100752=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100753
Akronf7ad89e2016-03-16 18:22:47 +0100754=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100755
756=back
757
758=item B<OpenNLP>
759
760=over 4
761
Akronf7ad89e2016-03-16 18:22:47 +0100762=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100763
Akronf7ad89e2016-03-16 18:22:47 +0100764=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100765
766=back
767
768=item B<Sgbr>
769
770=over 4
771
Akronf7ad89e2016-03-16 18:22:47 +0100772=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100773
Akronf7ad89e2016-03-16 18:22:47 +0100774=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100775
776=back
777
778=item B<TreeTagger>
779
780=over 4
781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100783
Akronf7ad89e2016-03-16 18:22:47 +0100784=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100785
786=back
787
788=item B<XIP>
789
790=over 4
791
Akronf7ad89e2016-03-16 18:22:47 +0100792=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100793
Akronf7ad89e2016-03-16 18:22:47 +0100794=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100795
Akronf7ad89e2016-03-16 18:22:47 +0100796=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100797
798=back
799
800=back
801
802More importers are in preparation.
803New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
804See the built-in annotation importers as examples.
805
Akron941c1a62016-02-23 17:41:41 +0100806=head1 AVAILABILITY
807
808 https://github.com/KorAP/KorAP-XML-Krill
809
810
811=head1 COPYRIGHT AND LICENSE
812
813Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100814
Akron941c1a62016-02-23 17:41:41 +0100815Author: L<Nils Diewald|http://nils-diewald.de/>
816
817L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
818Corpus Analysis Platform at the
819L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
820member of the
821L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
822
823This program is free software published under the
824L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
825
826=cut