blob: cedd84e582c9f52b0ea6c6536fdf598efbbdbaa5 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
# 2016/10/27
77# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron941c1a62016-02-23 17:41:41 +010079# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010080
# Date of the latest entry in the change log
our $LAST_CHANGE = '2016/10/27';

# Directory of this script, as located by FindBin
our $LOCAL = $FindBin::Bin;

# Banner that is printed together with the usage and version output
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command:
# A first argument that does not start with a dash is
# taken as the sub command and removed from the arguments
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +010093
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# Getopt::Long reports unknown or malformed options on STDERR and
# returns a false value; in that case the usage is printed and the
# script stops instead of continuing with a partial configuration.
GetOptions(
  'input|i=s'   => \@input,
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base = 'OpenNLP#tokens'),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),

  # Print the manual sections describing the usage
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose => 99,
      -msg => $VERSION_MSG,
      -output => '-'
    );
  },

  # Print the version banner together with the synopsis
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG,
      -output => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose => 99,
  -msg => $VERSION_MSG,
  -output => '-',
  -exit => 1
);
132
# Arguments passed to pod2usage() whenever the script is called incorrectly
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Configuration of the log4perl object:
# Everything on the requested level is reported on STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
156
# Lookup table of all lower-cased values passed using --skip
my %skip = map { (lc($_) => 1) } @skip;

# All built-in annotations as pairs of foundry and layer
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything built-in is skipped - only the annotations
# requested using --anno are taken into account
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  foreach my $anno (@layers) {
    my $foundry = lc($anno->[0]);
    my $layer   = lc($anno->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry};
    next if $skip{$foundry . '#' . $layer};

    push @filtered_anno, $anno;
  };
};
228
# Get tokenization basis.
# Declaration and conditional assignment are kept apart, as the
# behaviour of "my" combined with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Shared cache that is handed over to the batch object
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that is used by all processing modes
# to convert a single text directory
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
252
Akron941c1a62016-02-23 17:41:41 +0100253
# Get file name based on path information.
#
# Expects the path of a text (optionally already followed by a file
# extension) and returns a flat name, in which the path separators are
# replaced by dashes. The path of the first input is removed from the
# name beforehand.
sub get_file_name ($) {
  my $file = shift;

  # Path fragment to remove from the name: The first input,
  # or - if that is a directory - everything in front of its
  # last path segment (nothing changes on a trailing slash)
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path (with an optional
  # leading slash). The path is quoted, so characters like '.', '+',
  # '(' or '\' are matched literally instead of being interpreted
  # as part of the regular expression.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the leading part, if at least three further parts follow
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
270
# Convert sigle to path construct,
# i.e. Corpus_Document.Text becomes Corpus/Document/Text
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Sub commands write to an output directory, that has to exist.
# A missing directory is an error, so the script ends with a
# non-zero exit status (like all other fatal problems reported).
if ($cmd) {
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };
};
280
281
# Process a single file
unless ($cmd) {

  # Take the start time already at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # The path of the document is passed with a trailing slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000312
# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # If true, './' is prepended to all paths requested from the archive
    my $prefix = 1;

    # No sigles given - extract all texts listed in the archive
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {

      # Sigles that do not address whole documents
      my @new_sigle;

      # The prefix is only requested once from the archive
      my $prefix_check = 0;

      # Iterate over all sigle
      foreach (@sigle) {

        # Sigle is a doc sigle (Corpus/Document without a text part)
        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$_ ...\n";

          # Check if a prefix is needed
          unless ($prefix_check) {
            $prefix = $archive->check_prefix;
            $prefix_check = 1;
          };

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $_;

          print '... ' . (
            $archive->extract_doc(
              $path, $output, $jobs
            ) ? '' : 'not '
          );
          print "extracted.\n";
        }

        # Text sigles are extracted below
        else {
          push @new_sigle, $_;
        };
      };
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ...\n";

      # TODO: Make this OS independent
      print '... ' . (
        $archive->extract_text(
          ($prefix ? './' : '') . $_, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    print "\n";

    # The extraction run is finished - end with a success status
    exit(0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Nothing was extracted - end with a failure status
    exit(1);
  };
}
402
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories are created using File::Temp below -
  # make sure the module is loaded, independent of other modules
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last argument
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory object, if one was passed
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if ($batch_file->process($dir => $filename)) {
          # Delete temporary file
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          # Delete temporary file
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  # Nothing can be processed - clean up and end with a failure status.
  # (Without a start time, the final report below can't be created.)
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;

    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
558
# Unknown command:
# Warn about the command and print the usage
else {
  warn("Unknown command '$cmd'.\n\n");
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000564
565__END__
Akron941c1a62016-02-23 17:41:41 +0100566
567=pod
568
569=encoding utf8
570
571=head1 NAME
572
Akronf7ad89e2016-03-16 18:22:47 +0100573korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100574
575
576=head1 SYNOPSIS
577
Akrona76d8352016-10-27 16:27:32 +0200578 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100579
Akron2fd402b2016-10-27 21:26:48 +0200580
Akron941c1a62016-02-23 17:41:41 +0100581=head1 DESCRIPTION
582
583L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
584compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100585The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100586
587
588=head1 INSTALLATION
589
590The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
591
Akronaf386982016-10-12 00:33:25 +0200592 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100593
Akronc13a1702016-03-15 19:33:14 +0100594In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100595be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200596Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100598
599=head1 ARGUMENTS
600
Akrona76d8352016-10-27 16:27:32 +0200601 $ korapxml2krill -z --input <directory> --output <filename>
602
603Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200604It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200605
Akron941c1a62016-02-23 17:41:41 +0100606=over 2
607
608=item B<archive>
609
Akrona76d8352016-10-27 16:27:32 +0200610 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
611
Akron2fd402b2016-10-27 21:26:48 +0200612Converts an archive of KorAP-XML documents. It expects a directory
613(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100614
615=item B<extract>
616
Akrona76d8352016-10-27 16:27:32 +0200617 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
618
619Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100620
621=back
622
623
624=head1 OPTIONS
625
626=over 2
627
Akrona76d8352016-10-27 16:27:32 +0200628=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100629
Akrona76d8352016-10-27 16:27:32 +0200630Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100631
Akron7606afa2016-10-25 16:23:49 +0200632Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akrona76d8352016-10-27 16:27:32 +0200633document, while C<archive> and C<extract> support zip files as well.
Akron7606afa2016-10-25 16:23:49 +0200634
Akrona76d8352016-10-27 16:27:32 +0200635C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200636that the first archive listed contains all primary data files
637and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200638
Akron7606afa2016-10-25 16:23:49 +0200639 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200640
Akron0c3e3752016-06-28 15:55:53 +0200641(The directory structure follows the base directory format,
642that may include a C<.> root folder.
643In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200644need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200646
Akron7606afa2016-10-25 16:23:49 +0200647To support zip files, a version of C<unzip> needs to be installed that is
648compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200649
Akron7606afa2016-10-25 16:23:49 +0200650B<The root folder switch using the hash sign is experimental and
651may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200652
Akron941c1a62016-02-23 17:41:41 +0100653=item B<--output|-o> <directory|file>
654
655Output folder for archive processing or
656document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100657writes to C<STDOUT> by default
658(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100659
660=item B<--overwrite|-w>
661
662Overwrite files that already exist.
663
664=item B<--token|-t> <foundry>[#<file>]
665
666Define the default tokenization by specifying
667the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100668of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100669
670=item B<--skip|-s> <foundry>[#<layer>]
671
Akronf7ad89e2016-03-16 18:22:47 +0100672Skip specific annotations by specifying the foundry
673(and optionally the layer with a C<#>-prefix),
674e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100675Can be set multiple times.
676
Akronc13a1702016-03-15 19:33:14 +0100677=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100678
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
Only takes effect if all built-in annotations are skipped
using C<--skip #ALL>.
Akron941c1a62016-02-23 17:41:41 +0100683
684=item B<--primary|-p>
685
Akronc13a1702016-03-15 19:33:14 +0100686Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100687Can be flagged using C<--no-primary> as well.
688This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100689
690=item B<--jobs|-j>
691
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100693for archive processing.
Akron11c80302016-03-18 19:44:43 +0100694Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100695This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100696
Akron35db6e32016-03-17 22:42:22 +0100697=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100698
Akron35db6e32016-03-17 22:42:22 +0100699Define the metadata parser to use. Defaults to C<I5>.
700Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
701This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100702
703=item B<--pretty|-y>
704
Akronc13a1702016-03-15 19:33:14 +0100705Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100706This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100707
708=item B<--gzip|-z>
709
Akronf7ad89e2016-03-16 18:22:47 +0100710Compress the output.
711Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100712
Akron11c80302016-03-18 19:44:43 +0100713=item B<--cache|-c>
714
715File to mmap a cache (using L<Cache::FastMmap>).
716Defaults to C<korapxml2krill.cache> in the calling directory.
717
718=item B<--cache-size|-cs>
719
720Size of the cache. Defaults to C<50m>.
721
722=item B<--cache-init|-ci>
723
724Initialize cache file.
725Can be flagged using C<--no-cache-init> as well.
726Defaults to C<true>.
727
728=item B<--cache-delete|-cd>
729
730Delete cache file after processing.
731Can be flagged using C<--no-cache-delete> as well.
732Defaults to C<true>.
733
Akrone10ad322016-02-27 10:54:26 +0100734=item B<--sigle|-sg>
735
Akron20807582016-10-26 17:11:34 +0200736Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100737Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100738I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200739Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200740In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200741On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100742
Akron941c1a62016-02-23 17:41:41 +0100743=item B<--log|-l>
744
The L<Log::Log4perl> log level, defaults to C<ERROR>.
746
747=item B<--help|-h>
748
749Print this document.
750
751=item B<--version|-v>
752
753Print version information.
754
755=back
756
Akronc13a1702016-03-15 19:33:14 +0100757=head1 ANNOTATION SUPPORT
758
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
760developed in the KorAP project that are part of the KorAP preprocessing pipeline.
761The base foundry with paragraphs, sentences, and the text element are mandatory for
762L<Krill|https://github.com/KorAP/Krill>.
763
Akronf7ad89e2016-03-16 18:22:47 +0100764=over 2
Akronc13a1702016-03-15 19:33:14 +0100765
766=item B<Base>
767
768=over 4
769
Akronf7ad89e2016-03-16 18:22:47 +0100770=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100771
Akronf7ad89e2016-03-16 18:22:47 +0100772=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100773
774=back
775
776=item B<Connexor>
777
778=over 4
779
Akronf7ad89e2016-03-16 18:22:47 +0100780=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100781
Akronf7ad89e2016-03-16 18:22:47 +0100782=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100783
Akronf7ad89e2016-03-16 18:22:47 +0100784=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100785
Akronf7ad89e2016-03-16 18:22:47 +0100786=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100787
788=back
789
790=item B<CoreNLP>
791
792=over 4
793
Akronf7ad89e2016-03-16 18:22:47 +0100794=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100795
Akronf7ad89e2016-03-16 18:22:47 +0100796=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100797
Akronf7ad89e2016-03-16 18:22:47 +0100798=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100799
Akronf7ad89e2016-03-16 18:22:47 +0100800=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100801
802=back
803
804=item B<DeReKo>
805
806=over 4
807
Akronf7ad89e2016-03-16 18:22:47 +0100808=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100809
810=back
811
812=item B<Glemm>
813
814=over 4
815
Akronf7ad89e2016-03-16 18:22:47 +0100816=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100817
818=back
819
820=item B<Mate>
821
822=over 4
823
Akronf7ad89e2016-03-16 18:22:47 +0100824=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100825
Akronf7ad89e2016-03-16 18:22:47 +0100826=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100827
828=back
829
830=item B<OpenNLP>
831
832=over 4
833
Akronf7ad89e2016-03-16 18:22:47 +0100834=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100835
Akronf7ad89e2016-03-16 18:22:47 +0100836=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100837
838=back
839
840=item B<Sgbr>
841
842=over 4
843
Akronf7ad89e2016-03-16 18:22:47 +0100844=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100845
Akronf7ad89e2016-03-16 18:22:47 +0100846=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100847
848=back
849
850=item B<TreeTagger>
851
852=over 4
853
Akronf7ad89e2016-03-16 18:22:47 +0100854=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100855
Akronf7ad89e2016-03-16 18:22:47 +0100856=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100857
858=back
859
860=item B<XIP>
861
862=over 4
863
Akronf7ad89e2016-03-16 18:22:47 +0100864=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100865
Akronf7ad89e2016-03-16 18:22:47 +0100866=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100867
Akronf7ad89e2016-03-16 18:22:47 +0100868=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100869
870=back
871
872=back
873
874More importers are in preparation.
875New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
876See the built-in annotation importers as examples.
877
Akron941c1a62016-02-23 17:41:41 +0100878=head1 AVAILABILITY
879
880 https://github.com/KorAP/KorAP-XML-Krill
881
882
883=head1 COPYRIGHT AND LICENSE
884
885Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100886
Akron941c1a62016-02-23 17:41:41 +0100887Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200888Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100889
890L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
891Corpus Analysis Platform at the
892L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
893member of the
894L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
895
896This program is free software published under the
897L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
898
899=cut