blob: 81497fe69cea8c3b418023e522f548694c578719 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use File::Temp ();
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron3741f8b2016-12-21 19:55:21 +010076# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020077# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron3741f8b2016-12-21 19:55:21 +010079# 2016/12/21
80# - added support for base-sentences and base-tokenizations
81#
Akron4fa37c32017-01-20 14:43:10 +010082# 2017/01/20
83# - added support for DRuKoLa annotations
84#
Akron941c1a62016-02-23 17:41:41 +010085# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010086
# Date of the last entry in the CHANGES list above
our $LAST_CHANGE = '2017/01/20';

# Directory containing this script (as reported by FindBin)
our $LOCAL = $FindBin::Bin;

# Banner printed along with the usage and version information
our $VERSION_MSG = 'Version ' . $KorAP::XML::Krill::VERSION
  . ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";

# Parse command: a leading argument that does not start
# with a dash is taken as the command and removed from @ARGV
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +010099
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Single-value options without a default
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Single-value options with their defaults
my $token_base      = 'OpenNLP#tokens';
my $base_sentences  = '';
my $base_paragraphs = '';
my $cache_file      = 'korapxml2krill.cache';
my $log_level       = 'ERROR';
my $jobs            = 0;
my $cache_size      = '50m';
my $cache_delete    = 1;
my $cache_init      = 1;

# Parse options from the command line
GetOptions(
  'input|i=s'             => \@input,
  'output|o=s'            => \$output,
  'overwrite|w'           => \$overwrite,
  'meta|m=s'              => \$meta,
  'token|t=s'             => \$token_base,
  'base-sentences|bs=s'   => \$base_sentences,
  'base-paragraphs|bp=s'  => \$base_paragraphs,
  'gzip|z'                => \$gzip,
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \$cache_file,
  'log|l=s'               => \$log_level,
  'anno|a=s'              => \@anno,
  'primary|p!'            => \$primary,
  'pretty|y'              => \$pretty,
  'jobs|j=i'              => \$jobs,
  'cache-size|cs=s'       => \$cache_size,
  'cache-delete|cd!'      => \$cache_delete,
  'cache-init|ci!'        => \$cache_init,

  # Print the usage sections of the POD below
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
140
# Base layer names are compared in lower case
$_ = lc($_) foreach ($base_sentences, $base_paragraphs);

# Arguments for pod2usage when the call is invalid (exits with 1)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Initialize log4perl object - everything is logged to STDERR
my $root_logger = uc($log_level) . ', STDERR';
Log::Log4perl->init({
  'log4perl.rootLogger' => $root_logger,
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');
167
# Lookup table of everything to skip (keys are lower case)
my %skip = map {; lc($_) => 1 } @skip;

# All supported annotations as [Foundry, Layer] pairs
my @layers;

# Base layers are only added, if no alternative was requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (
  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency']
);

# DeReKo - an optional third element passes on which
# base layers were requested from DeReKo#Structure
{
  my $use_sentences  = $base_sentences  eq 'dereko#structure';
  my $use_paragraphs = $base_paragraphs eq 'dereko#structure';

  my @dereko = ('DeReKo', 'Structure');
  if ($use_sentences && $use_paragraphs) {
    push @dereko, 'base-sentences-paragraphs';
  }
  elsif ($use_sentences) {
    push @dereko, 'base-sentences';
  }
  elsif ($use_paragraphs) {
    push @dereko, 'base-paragraphs';
  };
  push @layers, \@dereko;
};

push @layers, (
  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations given explicitly
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
255
# Get tokenization basis (Foundry#Layer).
# The variables are declared unconditionally, as the behaviour
# of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache shared via the mmapped cache file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that does the conversion
# for all commands below
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
279
Akron941c1a62016-02-23 17:41:41 +0100280
# Get file name based on path information.
#
# Expects the path of a text (or of its output file) and returns
# a flat file name, in which the path separators are replaced by
# dashes. Fragments of temporary directories and the path of the
# first input are removed beforehand, and if the name has more than
# three dash separated parts, leading parts are dropped.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input - for directories without a trailing
  # slash the last path segment is kept as part of the name
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is user input and therefore
  # quoted, so characters like '.', '+' or '\' are matched literally
  # instead of being interpreted as regular expression syntax.
  # (As before, the match is not anchored to the start of the path.)
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop leading parts, if there are more than three
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
297
# Convert sigle to path construct
# (e.g. Corpus_Document.Text becomes Corpus/Document/Text)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# All commands write to a directory - a given output has to be one
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is an error, so don't report success to the caller
  exit(1);
};
307
308
# Remember the start time as early as possible (at compile time)
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};

# Log the time passed since the last call and since the start
sub stop_time {
  my $now = Benchmark->new;
  my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->info(
    'The code took: ' . $since_last_stop . ' (overall: ' . $overall . ')'
  );
  $main::LAST_STOP = $now;
};

# Process a single file
if (!$cmd) {

  # The document path requires a trailing slash
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Create, parse and write the document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000339
# Extract XML files
# Exits with 0, if everything was extracted, and with 1 otherwise.
elsif ($cmd eq 'extract') {

  # Create new archive object
  my $archive;
  unless (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {

    # Can't create archive object
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Attach further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Whether the paths in the archive need a './' prefix
  my $prefix = 1;

  # Number of sigles that could not be extracted
  my $failed = 0;

  # No sigles given - extract all texts
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  }

  # Check sigle for doc sigles
  else {
    my @text_sigle;
    my $prefix_check = 0;

    # Iterate over all sigle
    foreach my $sigle (@sigle) {

      # Sigle is a text sigle - extracted below
      unless ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
        push @text_sigle, $sigle;
        next;
      };

      # Sigle is a doc sigle
      print "$sigle ...\n";

      # Check if a prefix is needed (only once)
      unless ($prefix_check) {
        $prefix = $archive->check_prefix;
        $prefix_check = 1;
      };

      # TODO: Make this OS independent
      my $path = ($prefix ? './' : '') . $sigle;

      my $extracted = $archive->extract_doc($path, $output, $jobs);
      $failed++ unless $extracted;
      print '... ' . ($extracted ? '' : 'not ') . "extracted.\n";
    };
    @sigle = @text_sigle;
  };

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {
    print "$sigle ...\n";

    # TODO: Make this OS independent
    my $extracted = $archive->extract_text(
      ($prefix ? './' : '') . $sigle, $output
    );
    $failed++ unless $extracted;
    print '... ' . ($extracted ? '' : 'not ') . "extracted.\n";
  };

  print "\n";

  # Only report success, if nothing failed
  exit($failed ? 1 : 0);
}
429
# Process an archive
# The input is either a directory of texts or a zip archive
# (with optionally further archives attached). Each text is
# converted to one file in the output directory.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish(
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last argument:
      # [message, temporary directory (optional)]
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to initialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Attach further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory to extract to
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Unable to extract from archive
      unless ($archive->extract_text($text_path, $temp)) {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
        next ARCHIVE_LOOP;
      };

      # Corpus directory in the temporary directory
      my $input = catdir("$temp", $corpus);

      # Directory of the extracted text
      my $dir = catdir($input, $doc, $text);

      # Write file - the temporary directory is passed on,
      # so it can be released on finish
      if (my $return = $batch_file->process($dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dir, $temp]);
      };
    };
  }

  # Nothing to process: stop here instead of reporting "Done."
  # and a timing that would be based on an unset start time
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000598
599__END__
Akron941c1a62016-02-23 17:41:41 +0100600
601=pod
602
603=encoding utf8
604
605=head1 NAME
606
Akronf7ad89e2016-03-16 18:22:47 +0100607korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100608
609
610=head1 SYNOPSIS
611
Akrona76d8352016-10-27 16:27:32 +0200612 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100613
Akron2fd402b2016-10-27 21:26:48 +0200614
Akron941c1a62016-02-23 17:41:41 +0100615=head1 DESCRIPTION
616
617L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
618compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100619The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100620
621
622=head1 INSTALLATION
623
624The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
625
Akronaf386982016-10-12 00:33:25 +0200626 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100627
Akronc13a1702016-03-15 19:33:14 +0100628In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100629be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200630Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100632
633=head1 ARGUMENTS
634
Akrona76d8352016-10-27 16:27:32 +0200635 $ korapxml2krill -z --input <directory> --output <filename>
636
637Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200638It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200639
Akron941c1a62016-02-23 17:41:41 +0100640=over 2
641
642=item B<archive>
643
Akrona76d8352016-10-27 16:27:32 +0200644 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
645
Akron2fd402b2016-10-27 21:26:48 +0200646Converts an archive of KorAP-XML documents. It expects a directory
647(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100648
649=item B<extract>
650
Akrona76d8352016-10-27 16:27:32 +0200651 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
652
653Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100654
655=back
656
657
658=head1 OPTIONS
659
660=over 2
661
Akrona76d8352016-10-27 16:27:32 +0200662=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100663
Akrona76d8352016-10-27 16:27:32 +0200664Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100665
Akron7606afa2016-10-25 16:23:49 +0200666Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100667document, while C<archive> expects a KorAP-XML corpus folder or a zip
668file to batch process multiple files.
669C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200670
Akrona76d8352016-10-27 16:27:32 +0200671C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200672that the first archive listed contains all primary data files
673and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200674
Akron7606afa2016-10-25 16:23:49 +0200675 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200676
Akron0c3e3752016-06-28 15:55:53 +0200677(The directory structure follows the base directory format,
678that may include a C<.> root folder.
679In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200680need to be passed with a hash sign in front of the archive's name.
681This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200682
Akron7606afa2016-10-25 16:23:49 +0200683To support zip files, a version of C<unzip> needs to be installed that is
684compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200685
Akron7606afa2016-10-25 16:23:49 +0200686B<The root folder switch using the hash sign is experimental and
687may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200688
Akron941c1a62016-02-23 17:41:41 +0100689=item B<--output|-o> <directory|file>
690
691Output folder for archive processing or
692document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100693writes to C<STDOUT> by default
694(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100695
696=item B<--overwrite|-w>
697
698Overwrite files that already exist.
699
Akron3741f8b2016-12-21 19:55:21 +0100700=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100701
702Define the default tokenization by specifying
703the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100704of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100705
Akron3741f8b2016-12-21 19:55:21 +0100706
707=item B<--base-sentences|-bs> <foundry>#<layer>
708
709Define the layer for base sentences.
710If given, this will be used instead of using C<Base#Sentences>.
711Currently C<DeReKo#Structure> is the only additional layer supported.
712
713 Defaults to unset.
714
715
716=item B<--base-paragraphs|-bp> <foundry>#<layer>
717
718Define the layer for base paragraphs.
719If given, this will be used instead of using C<Base#Paragraphs>.
720Currently C<DeReKo#Structure> is the only additional layer supported.
721
722 Defaults to unset.
723
724
Akron941c1a62016-02-23 17:41:41 +0100725=item B<--skip|-s> <foundry>[#<layer>]
726
Akronf7ad89e2016-03-16 18:22:47 +0100727Skip specific annotations by specifying the foundry
728(and optionally the layer with a C<#>-prefix),
729e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100730Can be set multiple times.
731
Akronc13a1702016-03-15 19:33:14 +0100732=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100733
Akronf7ad89e2016-03-16 18:22:47 +0100734Convert specific annotations by specifying the foundry
735(and optionally the layer with a C<#>-prefix),
736e.g. C<Mate> or C<Mate#Morpho>.
737Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100738
739=item B<--primary|-p>
740
Akronc13a1702016-03-15 19:33:14 +0100741Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100742Can be flagged using C<--no-primary> as well.
743This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100744
745=item B<--jobs|-j>
746
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100748for archive processing.
Akron11c80302016-03-18 19:44:43 +0100749Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100750This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100751
Akron35db6e32016-03-17 22:42:22 +0100752=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100753
Akron35db6e32016-03-17 22:42:22 +0100754Define the metadata parser to use. Defaults to C<I5>.
755Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
756This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100757
758=item B<--pretty|-y>
759
Akronc13a1702016-03-15 19:33:14 +0100760Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100761This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100762
763=item B<--gzip|-z>
764
Akronf7ad89e2016-03-16 18:22:47 +0100765Compress the output.
766Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100767
Akron11c80302016-03-18 19:44:43 +0100768=item B<--cache|-c>
769
770File to mmap a cache (using L<Cache::FastMmap>).
771Defaults to C<korapxml2krill.cache> in the calling directory.
772
773=item B<--cache-size|-cs>
774
775Size of the cache. Defaults to C<50m>.
776
777=item B<--cache-init|-ci>
778
779Initialize cache file.
780Can be flagged using C<--no-cache-init> as well.
781Defaults to C<true>.
782
783=item B<--cache-delete|-cd>
784
785Delete cache file after processing.
786Can be flagged using C<--no-cache-delete> as well.
787Defaults to C<true>.
788
Akrone10ad322016-02-27 10:54:26 +0100789=item B<--sigle|-sg>
790
Akron20807582016-10-26 17:11:34 +0200791Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100792Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100793I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200794Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200795In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200796On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100797
Akron941c1a62016-02-23 17:41:41 +0100798=item B<--log|-l>
799
800The L<Log::Log4perl> log level, defaults to C<ERROR>.
801
802=item B<--help|-h>
803
804Print this document.
805
806=item B<--version|-v>
807
808Print version information.
809
810=back
811
Akronc13a1702016-03-15 19:33:14 +0100812=head1 ANNOTATION SUPPORT
813
814L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
815developed in the KorAP project that are part of the KorAP preprocessing pipeline.
816The base foundry with paragraphs, sentences, and the text element is mandatory for
817L<Krill|https://github.com/KorAP/Krill>.
818
Akronf7ad89e2016-03-16 18:22:47 +0100819=over 2
Akronc13a1702016-03-15 19:33:14 +0100820
821=item B<Base>
822
823=over 4
824
Akronf7ad89e2016-03-16 18:22:47 +0100825=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100826
Akronf7ad89e2016-03-16 18:22:47 +0100827=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100828
829=back
830
831=item B<Connexor>
832
833=over 4
834
Akronf7ad89e2016-03-16 18:22:47 +0100835=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100836
Akronf7ad89e2016-03-16 18:22:47 +0100837=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100838
Akronf7ad89e2016-03-16 18:22:47 +0100839=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100840
Akronf7ad89e2016-03-16 18:22:47 +0100841=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100842
843=back
844
845=item B<CoreNLP>
846
847=over 4
848
Akronf7ad89e2016-03-16 18:22:47 +0100849=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100850
Akronf7ad89e2016-03-16 18:22:47 +0100851=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100852
Akronf7ad89e2016-03-16 18:22:47 +0100853=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100854
Akronf7ad89e2016-03-16 18:22:47 +0100855=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100856
857=back
858
859=item B<DeReKo>
860
861=over 4
862
Akronf7ad89e2016-03-16 18:22:47 +0100863=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100864
865=back
866
867=item B<Glemm>
868
869=over 4
870
Akronf7ad89e2016-03-16 18:22:47 +0100871=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100872
873=back
874
875=item B<Mate>
876
877=over 4
878
Akronf7ad89e2016-03-16 18:22:47 +0100879=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100880
Akronf7ad89e2016-03-16 18:22:47 +0100881=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100882
883=back
884
885=item B<OpenNLP>
886
887=over 4
888
Akronf7ad89e2016-03-16 18:22:47 +0100889=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100890
Akronf7ad89e2016-03-16 18:22:47 +0100891=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100892
893=back
894
895=item B<Sgbr>
896
897=over 4
898
Akronf7ad89e2016-03-16 18:22:47 +0100899=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100900
Akronf7ad89e2016-03-16 18:22:47 +0100901=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100902
903=back
904
Akron4fa37c32017-01-20 14:43:10 +0100905=item B<DRuKoLa>
906
907=over 4
908
909=item #Morpho
910
911=back
912
Akronc13a1702016-03-15 19:33:14 +0100913=item B<TreeTagger>
914
915=over 4
916
Akronf7ad89e2016-03-16 18:22:47 +0100917=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100918
Akronf7ad89e2016-03-16 18:22:47 +0100919=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100920
921=back
922
923=item B<XIP>
924
925=over 4
926
Akronf7ad89e2016-03-16 18:22:47 +0100927=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100928
Akronf7ad89e2016-03-16 18:22:47 +0100929=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100930
Akronf7ad89e2016-03-16 18:22:47 +0100931=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100932
933=back
934
935=back
936
937More importers are in preparation.
938New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
939See the built-in annotation importers as examples.
940
Akron941c1a62016-02-23 17:41:41 +0100941=head1 AVAILABILITY
942
943 https://github.com/KorAP/KorAP-XML-Krill
944
945
946=head1 COPYRIGHT AND LICENSE
947
Akron3ec0a1c2017-01-18 14:41:55 +0100948Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100949
Akron941c1a62016-02-23 17:41:41 +0100950Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200951Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100952
953L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
954Corpus Analysis Platform at the
955L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
956member of the
957L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
958
959This program is free software published under the
960L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
961
962=cut