blob: 27eb36006961cf7a3e4646dc04cd96e95a0b9aa0 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
# 2016/10/27
77# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron941c1a62016-02-23 17:41:41 +010079# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010080
# Date of the newest entry in the change log above
our $LAST_CHANGE = '2016/10/27';

# Directory of the script as reported by FindBin
our $LOCAL = $FindBin::Bin;

# Banner printed in front of the help and version output
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command:
# A first argument that does not start with a dash is taken as the
# sub-command and removed from the argument list before option parsing.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +010093
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Single value options without a default
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Single value options with their defaults
my $token_base   = 'OpenNLP#tokens';
my $cache_file   = 'korapxml2krill.cache';
my $log_level    = 'ERROR';
my $jobs         = 0;
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;

# Parse options from the command line.
# --help and --version are answered directly by pod2usage().
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'help|h'           => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },
  'version|v'        => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
132
# Arguments for pod2usage() in case the invocation is invalid:
# print the usage sections and leave with exit status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000146
# Initialize log4perl object:
# everything at or above the requested level is written to a
# colored screen appender named STDERR
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
156
# Lookup table of all values passed to --skip (lower cased)
my %skip = map { (lc($_) => 1) } @skip;

# All annotations known to the converter as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations passed by --anno are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  foreach my $info (@layers) {
    my ($foundry, $layer) = (lc($info->[0]), lc($info->[1]));

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry};
    next if $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};
228
# Get tokenization basis:
# split the --token value "Foundry#Layer" into its two parts.
# The declaration is kept separate from the conditional assignment,
# because the behaviour of "my ... if COND" is undefined in Perl.
# Both variables stay undefined if no token base is given.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};
231
# Create the cache object backed by the file given by --cache
# TODO: This should not be initialized for batch
my @cache_args = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_args);

# Create batch object - it receives the cache, the logger,
# the tokenization basis and the list of annotations to convert
my @batch_args = (
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_args);
252
Akron941c1a62016-02-23 17:41:41 +0100253
# Get file name based on path information.
#
# Expects the path of a text (or of the file to create for it) and
# returns a flat name: the base path derived from the first --input
# value and temporary directory fragments are removed, slashes are
# replaced by dashes, leading dashes are dropped and - if at least four
# dash separated parts remain - the first part is cut off.
#
# The prototype is kept, so existing calls are parsed as before.
sub get_file_name ($) {
  my $file = shift;

  # Base path to strip from the file name. For a directory the last
  # path segment is cut off, so the folder name itself is preserved.
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path (with an optional leading slash).
  # The path is quoted, so characters like '.' or '+' in the input
  # are matched literally instead of being treated as regex syntax.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
}
270
# Convert sigle to path construct:
# a value of the form "Corpus_Doc.Text" (optionally surrounded by
# whitespace) is rewritten in place to "Corpus/Doc/Text".
# Values not matching this form are left untouched.
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
273
# With a sub-command a given output has to be an existing directory.
# A missing directory is an error, so the script leaves with a
# non-zero exit status.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
280
281
# Process a single file (no sub-command was given)
unless ($cmd) {

  # Start both timers as soon as this block is compiled,
  # i.e. before any run-time statement of the script is executed
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call (or since the timers were
  # started) together with the overall time, then restart the lap timer
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # The document path is the first input - it is passed on
  # with a trailing slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000312
# Extract XML files from the archive given as first input.
# Leaves with exit status 0 after the extraction run and with
# exit status 1 if the archive can't be opened or unzip is unusable.
elsif ($cmd eq 'extract') {

  # Create new archive object - the first input has to be a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f $input[0];

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Report the failure to the calling process
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # If true, './' is prepended to every path requested from the archive
  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      my ($corpus, $doc, $text);
      ($prefix, $corpus, $doc, $text) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };
  }

  # Check sigle for doc sigles
  else {

    # Sigles that are not doc sigles - they are extracted as texts below
    my @text_sigle;

    my $prefix_checked = 0;

    # Iterate over all sigle
    foreach my $sigle (@sigle) {

      # Sigle is not a doc sigle (two path segments with an optional
      # leading './' or '.\')
      unless ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
        push @text_sigle, $sigle;
        next;
      };

      print "$sigle ...\n";

      # Check if a prefix is needed (asked once per run)
      unless ($prefix_checked) {
        $prefix = $archive->check_prefix;
        $prefix_checked = 1;
      };

      # TODO: Make this OS independent
      my $path = ($prefix ? './' : '') . $sigle;

      my $extracted = $archive->extract_doc($path, $output, $jobs);
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    @sigle = @text_sigle;
  };

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {
    print "$sigle ...\n";

    # TODO: Make this OS independent
    my $extracted = $archive->extract_text(
      ($prefix ? './' : '') . $sigle, $output
    );
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  print "\n";

  # The extraction run is finished - this is not an error
  exit(0);
}
402
# Process an archive: convert all texts below a corpus directory or
# inside of one or more zip files, optionally in parallel forks.
# Leaves with exit status 1 if the input is neither a directory
# nor an archive.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish(
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last argument:
      # [message, optional temporary directory]
      my $data = $_[-1];

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Drop the reference to the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;    # Start of processing
  my $temp; # Temporary directory of the current text
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to initialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      if (my $status = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($status == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    # Temporary directories are created below - make sure the module
    # is loaded instead of relying on another module to pull it in
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information (the prefix is not needed here)
      my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Unable to extract
      unless ($archive->extract_text($text_path, $temp)) {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
        next ARCHIVE_LOOP;
      };

      # Corpus directory inside of the temporary directory
      my $input = catdir("$temp", $corpus);

      # Directory of the extracted text
      my $dir = catdir($input, $doc, $text);

      # Write file - the temporary directory is passed along,
      # so it is released in the finish callback
      if (my $status = $batch_file->process($dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($status == -1 ? " - already existing" : ''), $temp]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dir, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  # Processing never started, so there is no start time to report on.
  # This only happens on invalid input, which is an error.
  exit(1) unless defined $t;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000571
572__END__
Akron941c1a62016-02-23 17:41:41 +0100573
574=pod
575
576=encoding utf8
577
578=head1 NAME
579
Akronf7ad89e2016-03-16 18:22:47 +0100580korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100581
582
583=head1 SYNOPSIS
584
Akrona76d8352016-10-27 16:27:32 +0200585 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100586
Akron2fd402b2016-10-27 21:26:48 +0200587
Akron941c1a62016-02-23 17:41:41 +0100588=head1 DESCRIPTION
589
590L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
591compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100592The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100593
594
595=head1 INSTALLATION
596
597The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
598
Akronaf386982016-10-12 00:33:25 +0200599 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100600
Akronc13a1702016-03-15 19:33:14 +0100601In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100602be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200603Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100605
606=head1 ARGUMENTS
607
Akrona76d8352016-10-27 16:27:32 +0200608 $ korapxml2krill -z --input <directory> --output <filename>
609
610Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200611It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200612
Akron941c1a62016-02-23 17:41:41 +0100613=over 2
614
615=item B<archive>
616
Akrona76d8352016-10-27 16:27:32 +0200617 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
618
Akron2fd402b2016-10-27 21:26:48 +0200619Converts an archive of KorAP-XML documents. It expects a directory
620(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100621
622=item B<extract>
623
Akrona76d8352016-10-27 16:27:32 +0200624 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
625
626Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100627
628=back
629
630
631=head1 OPTIONS
632
633=over 2
634
Akrona76d8352016-10-27 16:27:32 +0200635=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100636
Akrona76d8352016-10-27 16:27:32 +0200637Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100638
Akron7606afa2016-10-25 16:23:49 +0200639Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100640document, while C<archive> expects a KorAP-XML corpus folder or a zip
641file to batch process multiple files.
642C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200643
Akrona76d8352016-10-27 16:27:32 +0200644C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200645that the first archive listed contains all primary data files
646and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200647
Akron7606afa2016-10-25 16:23:49 +0200648 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200649
Akron0c3e3752016-06-28 15:55:53 +0200650(The directory structure follows the base directory format,
651that may include a C<.> root folder.
652In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200653need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200655
Akron7606afa2016-10-25 16:23:49 +0200656To support zip files, a version of C<unzip> needs to be installed that is
657compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200658
Akron7606afa2016-10-25 16:23:49 +0200659B<The root folder switch using the hash sign is experimental and
660may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200661
Akron941c1a62016-02-23 17:41:41 +0100662=item B<--output|-o> <directory|file>
663
664Output folder for archive processing or
665document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100666writes to C<STDOUT> by default
667(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100668
669=item B<--overwrite|-w>
670
671Overwrite files that already exist.
672
673=item B<--token|-t> <foundry>[#<file>]
674
675Define the default tokenization by specifying
676the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100677of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100678
679=item B<--skip|-s> <foundry>[#<layer>]
680
Akronf7ad89e2016-03-16 18:22:47 +0100681Skip specific annotations by specifying the foundry
682(and optionally the layer with a C<#>-prefix),
683e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100684Can be set multiple times.
685
Akronc13a1702016-03-15 19:33:14 +0100686=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100687
Akronf7ad89e2016-03-16 18:22:47 +0100688Convert specific annotations by specifying the foundry
689(and optionally the layer with a C<#>-prefix),
690e.g. C<Mate> or C<Mate#Morpho>.
691Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100692
693=item B<--primary|-p>
694
Akronc13a1702016-03-15 19:33:14 +0100695Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100696Can be flagged using C<--no-primary> as well.
697This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100698
699=item B<--jobs|-j>
700
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100702for archive processing.
Akron11c80302016-03-18 19:44:43 +0100703Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100704This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100705
Akron35db6e32016-03-17 22:42:22 +0100706=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100707
Akron35db6e32016-03-17 22:42:22 +0100708Define the metadata parser to use. Defaults to C<I5>.
709Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
710This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100711
712=item B<--pretty|-y>
713
Akronc13a1702016-03-15 19:33:14 +0100714Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100715This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100716
717=item B<--gzip|-z>
718
Akronf7ad89e2016-03-16 18:22:47 +0100719Compress the output.
720Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100721
Akron11c80302016-03-18 19:44:43 +0100722=item B<--cache|-c>
723
724File to mmap a cache (using L<Cache::FastMmap>).
725Defaults to C<korapxml2krill.cache> in the calling directory.
726
727=item B<--cache-size|-cs>
728
729Size of the cache. Defaults to C<50m>.
730
731=item B<--cache-init|-ci>
732
733Initialize cache file.
734Can be flagged using C<--no-cache-init> as well.
735Defaults to C<true>.
736
737=item B<--cache-delete|-cd>
738
739Delete cache file after processing.
740Can be flagged using C<--no-cache-delete> as well.
741Defaults to C<true>.
742
Akrone10ad322016-02-27 10:54:26 +0100743=item B<--sigle|-sg>
744
Akron20807582016-10-26 17:11:34 +0200745Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100746Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100747I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200748Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200749In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200750On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100751
Akron941c1a62016-02-23 17:41:41 +0100752=item B<--log|-l>
753
754The L<Log4perl> log level, defaults to C<ERROR>.
755
756=item B<--help|-h>
757
758Print this document.
759
760=item B<--version|-v>
761
762Print version information.
763
764=back
765
Akronc13a1702016-03-15 19:33:14 +0100766=head1 ANNOTATION SUPPORT
767
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
769developed in the KorAP project that are part of the KorAP preprocessing pipeline.
770The base foundry with paragraphs, sentences, and the text element are mandatory for
771L<Krill|https://github.com/KorAP/Krill>.
772
Akronf7ad89e2016-03-16 18:22:47 +0100773=over 2
Akronc13a1702016-03-15 19:33:14 +0100774
775=item B<Base>
776
777=over 4
778
Akronf7ad89e2016-03-16 18:22:47 +0100779=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100780
Akronf7ad89e2016-03-16 18:22:47 +0100781=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100782
783=back
784
785=item B<Connexor>
786
787=over 4
788
Akronf7ad89e2016-03-16 18:22:47 +0100789=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100790
Akronf7ad89e2016-03-16 18:22:47 +0100791=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100796
797=back
798
799=item B<CoreNLP>
800
801=over 4
802
Akronf7ad89e2016-03-16 18:22:47 +0100803=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100804
Akronf7ad89e2016-03-16 18:22:47 +0100805=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100806
Akronf7ad89e2016-03-16 18:22:47 +0100807=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100808
Akronf7ad89e2016-03-16 18:22:47 +0100809=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100810
811=back
812
813=item B<DeReKo>
814
815=over 4
816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100818
819=back
820
821=item B<Glemm>
822
823=over 4
824
Akronf7ad89e2016-03-16 18:22:47 +0100825=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100826
827=back
828
829=item B<Mate>
830
831=over 4
832
Akronf7ad89e2016-03-16 18:22:47 +0100833=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100834
Akronf7ad89e2016-03-16 18:22:47 +0100835=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100836
837=back
838
839=item B<OpenNLP>
840
841=over 4
842
Akronf7ad89e2016-03-16 18:22:47 +0100843=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100844
Akronf7ad89e2016-03-16 18:22:47 +0100845=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100846
847=back
848
849=item B<Sgbr>
850
851=over 4
852
Akronf7ad89e2016-03-16 18:22:47 +0100853=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100854
Akronf7ad89e2016-03-16 18:22:47 +0100855=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100856
857=back
858
859=item B<TreeTagger>
860
861=over 4
862
Akronf7ad89e2016-03-16 18:22:47 +0100863=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100864
Akronf7ad89e2016-03-16 18:22:47 +0100865=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100866
867=back
868
869=item B<XIP>
870
871=over 4
872
Akronf7ad89e2016-03-16 18:22:47 +0100873=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100874
Akronf7ad89e2016-03-16 18:22:47 +0100875=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100876
Akronf7ad89e2016-03-16 18:22:47 +0100877=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100878
879=back
880
881=back
882
883More importers are in preparation.
884New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
885See the built-in annotation importers as examples.
886
Akron941c1a62016-02-23 17:41:41 +0100887=head1 AVAILABILITY
888
889 https://github.com/KorAP/KorAP-XML-Krill
890
891
892=head1 COPYRIGHT AND LICENSE
893
894Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100895
Akron941c1a62016-02-23 17:41:41 +0100896Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200897Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100898
899L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
900Corpus Analysis Platform at the
901L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
902member of the
903L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
904
905This program is free software published under the
906L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
907
908=cut