blob: 43ac47a92ee0267d0cdd426e60452e2d07faf334 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron3741f8b2016-12-21 19:55:21 +010076# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020077# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron3741f8b2016-12-21 19:55:21 +010079# 2016/12/21
80# - added support for base-sentences and base-tokenizations
81#
Akron4fa37c32017-01-20 14:43:10 +010082# 2017/01/20
83# - added support for DRuKoLa annotations
84#
Akron41ac10b2017-02-08 22:47:25 +010085# 2017/02/08
86# - added support for pagebreak annotations
87#
Akron941c1a62016-02-23 17:41:41 +010088# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010089
# Date of the most recent entry in the change log above
our $LAST_CHANGE = '2017/02/08';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Banner that is printed together with the help and version output
our $VERSION_MSG = join(
  '',
  'Version ', $KorAP::XML::Krill::VERSION,
  ' - diewald@ids-mannheim.de - ', $LAST_CHANGE, "\n"
);

# Parse command:
# A first argument that does not start with a dash is not an option
# but the name of the command to run
my $cmd;
our @ARGV;
$cmd = shift @ARGV
  if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';
Akron93d620e2016-02-05 19:40:05 +0100102
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options holding a single value (with their defaults, if any)
my $output;
my $overwrite;
my $meta;
my $token_base      = 'OpenNLP#tokens';
my $base_sentences  = '';
my $base_paragraphs = '';
my $base_pagebreaks = '';
my $gzip;
my $cache_file      = 'korapxml2krill.cache';
my $log_level       = 'ERROR';
my $primary;
my $pretty;
my $jobs            = 0;
my $cache_size      = '50m';
my $cache_delete    = 1;
my $cache_init      = 1;

# Handler for --help: print the manual sections with the banner
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: print the banner with the short usage
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'             => \@input,
  'output|o=s'            => \$output,
  'overwrite|w'           => \$overwrite,
  'meta|m=s'              => \$meta,
  'token|t=s'             => \$token_base,
  'base-sentences|bs=s'   => \$base_sentences,
  'base-paragraphs|bp=s'  => \$base_paragraphs,
  'base-pagebreaks|bpb=s' => \$base_pagebreaks,
  'gzip|z'                => \$gzip,
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \$cache_file,
  'log|l=s'               => \$log_level,
  'anno|a=s'              => \@anno,
  'primary|p!'            => \$primary,
  'pretty|y'              => \$pretty,
  'jobs|j=i'              => \$jobs,
  'cache-size|cs=s'       => \$cache_size,
  'cache-delete|cd!'      => \$cache_delete,
  'cache-init|ci!'        => \$cache_init,
  'help|h'                => $print_help,
  'version|v'             => $print_version
);

# The base layer names are compared in lower case later on
$_ = lc $_ foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);

# Arguments for pod2usage in case of invalid invocations
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# Initialize log4perl object:
# Everything at or above the requested level is written to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');
172
# Lower cased lookup table of everything passed using --skip
my %skip = map { lc($_) => 1 } @skip;

# Base annotations that are taken from DeReKo#Structure
# instead of the base foundry
my %base_layer_of = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $base_layer_of{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

# All supported annotations as [Foundry, Layer] pairs (in the
# order they are added to the document)
my @layers = (

  # Base - only used if no alternative base layer is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo - the third element names the base annotations to take over
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations requested
# explicitly using --anno are converted
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Keep unless the Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
266
# Get tokenization basis (Foundry#Layer).
# The variables are declared unconditionally, as the behaviour of
# "my" combined with a statement modifier ("my ... if ...") is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache that is handed to the batch object
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that converts single documents
# based on the given command line options
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
290
Akron941c1a62016-02-23 17:41:41 +0100291
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it,
# with the leading temporary directory and the path of the first
# input removed and all remaining slashes turned into hyphens.
sub get_file_name ($) {
  my $file = shift;

  # Path to remove from the file name: The first input,
  # without its last path segment if the input is a directory
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path (optionally
  # preceded by a slash). The path is quoted, so characters like
  # '.', '+' or backslashes are matched literally instead of being
  # interpreted as regular expression syntax.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # In case there are four or more hyphen separated segments,
  # the first segment is removed
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
308
# Convert sigles of the form Corpus_Document.Text to the
# path construct Corpus/Document/Text
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# If a command is given together with an output,
# the output has to be an existing directory
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is a failure, so do not signal success to the caller
  exit(1);
};
318
319
# No command given: Process a single document
unless ($cmd) {

  # Remember the start time as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time that passed since the last call
  # and since the start of the script
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info(
      join(
        '',
        'The code took: ', $since_last_stop,
        ' (overall: ', $overall, ')'
      )
    );
    $main::LAST_STOP = $now;
  };

  # The document path is passed with a trailing slash
  (my $document = $input[0]) =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($document, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000350
# Extract XML files from the archive passed as first input
# (texts, or whole documents in case of document sigles).
# Exits with 0 if everything was extracted and 1 otherwise.
elsif ($cmd eq 'extract') {

  # Create new archive object
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Paths in the archive are prefixed with './' if this is true
  my $prefix = 1;

  # Number of sigles that could not be extracted
  my $failed = 0;

  # No sigles given - extract all texts of the archive
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  }

  # Check sigle for doc sigles
  else {

    # Sigles that are not doc sigles and are extracted as texts below
    my @text_sigle;

    my $prefix_check = 0;

    # Iterate over all sigle
    foreach my $sigle (@sigle) {

      # Sigle is not a doc sigle (Corpus/Document) - keep it for later
      unless ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
        push @text_sigle, $sigle;
        next;
      };

      print "$sigle ...\n";

      # Check if a prefix is needed (only once)
      unless ($prefix_check) {
        $prefix = $archive->check_prefix;
        $prefix_check = 1;
      };

      # TODO: Make this OS independent
      my $path = ($prefix ? './' : '') . $sigle;

      my $extracted = $archive->extract_doc($path, $output, $jobs);
      $failed++ unless $extracted;

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };
    @sigle = @text_sigle;
  };

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {
    print "$sigle ...\n";

    # TODO: Make this OS independent
    my $extracted = $archive->extract_text(
      ($prefix ? './' : '') . $sigle, $output
    );
    $failed++ unless $extracted;

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  print "\n";

  # Signal success only if nothing failed to extract
  exit($failed ? 1 : 0);
}
440
# Process an archive: Convert all texts below a corpus directory
# or in a zip archive, optionally using multiple forks (--jobs)
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary folders for extraction are created using File::Temp,
  # so make sure the module is loaded instead of relying on
  # another module to load it
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The last argument is expected to be the array reference
      # passed to finish(): [message, temporary folder?]
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Drop the reference to the temporary folder, if given
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all folders containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      # A return value of -1 is reported as "already existing"
      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary folder to extract the text to
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exists at the beginning,
      # because extraction can be horribly slow!

      # Extract from archive
      if ($archive->extract_text($dirs[$i], $temp)) {

        # Corpus directory in the temporary folder
        my $corpus_dir = catdir("$temp", $corpus);

        # Directory of the extracted text
        my $dir = catdir($corpus_dir, $doc, $text);

        # Write file - the temporary folder is passed on,
        # so it can be released when the job is finished
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Nothing was processed, so there is no start time to report on
    # (timediff would fail on the undefined value) - clean up
    # and signal the failure
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000609
610__END__
Akron941c1a62016-02-23 17:41:41 +0100611
612=pod
613
614=encoding utf8
615
616=head1 NAME
617
Akronf7ad89e2016-03-16 18:22:47 +0100618korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100619
620
621=head1 SYNOPSIS
622
Akrona76d8352016-10-27 16:27:32 +0200623 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100624
Akron2fd402b2016-10-27 21:26:48 +0200625
Akron941c1a62016-02-23 17:41:41 +0100626=head1 DESCRIPTION
627
628L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
629compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100630The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100631
632
633=head1 INSTALLATION
634
635The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
636
Akronaf386982016-10-12 00:33:25 +0200637 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100638
Akronc13a1702016-03-15 19:33:14 +0100639In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100640be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200641Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100643
644=head1 ARGUMENTS
645
Akrona76d8352016-10-27 16:27:32 +0200646 $ korapxml2krill -z --input <directory> --output <filename>
647
648Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200649It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200650
Akron941c1a62016-02-23 17:41:41 +0100651=over 2
652
653=item B<archive>
654
Akrona76d8352016-10-27 16:27:32 +0200655 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
656
Akron2fd402b2016-10-27 21:26:48 +0200657Converts an archive of KorAP-XML documents. It expects a directory
658(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100659
660=item B<extract>
661
Akrona76d8352016-10-27 16:27:32 +0200662 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
663
664Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100665
666=back
667
668
669=head1 OPTIONS
670
671=over 2
672
Akrona76d8352016-10-27 16:27:32 +0200673=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100674
Akrona76d8352016-10-27 16:27:32 +0200675Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100676
Akron7606afa2016-10-25 16:23:49 +0200677Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100678document, while C<archive> expects a KorAP-XML corpus folder or a zip
679file to batch process multiple files.
680C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200681
Akrona76d8352016-10-27 16:27:32 +0200682C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200683that the first archive listed contains all primary data files
684and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200685
Akron7606afa2016-10-25 16:23:49 +0200686 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200687
Akron0c3e3752016-06-28 15:55:53 +0200688(The directory structure follows the base directory format,
689that may include a C<.> root folder.
690In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200691need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200693
Akron7606afa2016-10-25 16:23:49 +0200694To support zip files, a version of C<unzip> needs to be installed that is
695compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200696
Akron7606afa2016-10-25 16:23:49 +0200697B<The root folder switch using the hash sign is experimental and
698may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200699
Akron941c1a62016-02-23 17:41:41 +0100700=item B<--output|-o> <directory|file>
701
702Output folder for archive processing or
703document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100704writes to C<STDOUT> by default
705(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100706
707=item B<--overwrite|-w>
708
709Overwrite files that already exist.
710
Akron3741f8b2016-12-21 19:55:21 +0100711=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100712
713Define the default tokenization by specifying
714the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100715of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100716
Akron3741f8b2016-12-21 19:55:21 +0100717
718=item B<--base-sentences|-bs> <foundry>#<layer>
719
720Define the layer for base sentences.
721If given, this will be used instead of using C<Base#Sentences>.
722Currently C<DeReKo#Structure> is the only additional layer supported.
723
724 Defaults to unset.
725
726
727=item B<--base-paragraphs|-bp> <foundry>#<layer>
728
729Define the layer for base paragraphs.
730If given, this will be used instead of using C<Base#Paragraphs>.
731Currently C<DeReKo#Structure> is the only additional layer supported.
732
733 Defaults to unset.
734
735
Akron41ac10b2017-02-08 22:47:25 +0100736=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
737
738Define the layer for base pagebreaks.
739Currently C<DeReKo#Structure> is the only layer supported.
740
741 Defaults to unset.
742
743
Akron941c1a62016-02-23 17:41:41 +0100744=item B<--skip|-s> <foundry>[#<layer>]
745
Akronf7ad89e2016-03-16 18:22:47 +0100746Skip specific annotations by specifying the foundry
747(and optionally the layer with a C<#>-prefix),
748e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100749Can be set multiple times.
750
Akronc13a1702016-03-15 19:33:14 +0100751=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100752
Akronf7ad89e2016-03-16 18:22:47 +0100753Convert specific annotations by specifying the foundry
754(and optionally the layer with a C<#>-prefix),
755e.g. C<Mate> or C<Mate#Morpho>.
756Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100757
758=item B<--primary|-p>
759
Akronc13a1702016-03-15 19:33:14 +0100760Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100761Can be flagged using C<--no-primary> as well.
762This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100763
764=item B<--jobs|-j>
765
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100767for archive processing.
Akron11c80302016-03-18 19:44:43 +0100768Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100769This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100770
Akron35db6e32016-03-17 22:42:22 +0100771=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100772
Akron35db6e32016-03-17 22:42:22 +0100773Define the metadata parser to use. Defaults to C<I5>.
774Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
775This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100776
777=item B<--pretty|-y>
778
Akronc13a1702016-03-15 19:33:14 +0100779Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100780This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100781
782=item B<--gzip|-z>
783
Akronf7ad89e2016-03-16 18:22:47 +0100784Compress the output.
785Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100786
Akron11c80302016-03-18 19:44:43 +0100787=item B<--cache|-c>
788
789File to mmap a cache (using L<Cache::FastMmap>).
790Defaults to C<korapxml2krill.cache> in the calling directory.
791
792=item B<--cache-size|-cs>
793
794Size of the cache. Defaults to C<50m>.
795
796=item B<--cache-init|-ci>
797
798Initialize cache file.
799Can be flagged using C<--no-cache-init> as well.
800Defaults to C<true>.
801
802=item B<--cache-delete|-cd>
803
804Delete cache file after processing.
805Can be flagged using C<--no-cache-delete> as well.
806Defaults to C<true>.
807
Akrone10ad322016-02-27 10:54:26 +0100808=item B<--sigle|-sg>
809
Akron20807582016-10-26 17:11:34 +0200810Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100811Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100812I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200813Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200814In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200815On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100816
Akron941c1a62016-02-23 17:41:41 +0100817=item B<--log|-l>
818
819The L<Log::Log4perl> log level. Defaults to C<ERROR>.
820
821=item B<--help|-h>
822
823Print this document.
824
825=item B<--version|-v>
826
827Print version information.
828
829=back
830
Akronc13a1702016-03-15 19:33:14 +0100831=head1 ANNOTATION SUPPORT
832
833L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
834developed in the KorAP project that are part of the KorAP preprocessing pipeline.
835The base foundry with paragraphs, sentences, and the text element is mandatory for
836L<Krill|https://github.com/KorAP/Krill>.
837
Akronf7ad89e2016-03-16 18:22:47 +0100838=over 2
Akronc13a1702016-03-15 19:33:14 +0100839
840=item B<Base>
841
842=over 4
843
Akronf7ad89e2016-03-16 18:22:47 +0100844=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100845
Akronf7ad89e2016-03-16 18:22:47 +0100846=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100847
848=back
849
850=item B<Connexor>
851
852=over 4
853
Akronf7ad89e2016-03-16 18:22:47 +0100854=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100855
Akronf7ad89e2016-03-16 18:22:47 +0100856=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100857
Akronf7ad89e2016-03-16 18:22:47 +0100858=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100859
Akronf7ad89e2016-03-16 18:22:47 +0100860=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100861
862=back
863
864=item B<CoreNLP>
865
866=over 4
867
Akronf7ad89e2016-03-16 18:22:47 +0100868=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100869
Akronf7ad89e2016-03-16 18:22:47 +0100870=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100871
Akronf7ad89e2016-03-16 18:22:47 +0100872=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100873
Akronf7ad89e2016-03-16 18:22:47 +0100874=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100875
876=back
877
878=item B<DeReKo>
879
880=over 4
881
Akronf7ad89e2016-03-16 18:22:47 +0100882=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100883
884=back
885
886=item B<Glemm>
887
888=over 4
889
Akronf7ad89e2016-03-16 18:22:47 +0100890=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100891
892=back
893
894=item B<Mate>
895
896=over 4
897
Akronf7ad89e2016-03-16 18:22:47 +0100898=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100899
Akronf7ad89e2016-03-16 18:22:47 +0100900=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100901
902=back
903
904=item B<OpenNLP>
905
906=over 4
907
Akronf7ad89e2016-03-16 18:22:47 +0100908=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100909
Akronf7ad89e2016-03-16 18:22:47 +0100910=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100911
912=back
913
914=item B<Sgbr>
915
916=over 4
917
Akronf7ad89e2016-03-16 18:22:47 +0100918=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100919
Akronf7ad89e2016-03-16 18:22:47 +0100920=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100921
922=back
923
Akron4fa37c32017-01-20 14:43:10 +0100924=item B<DRuKoLa>
925
926=over 4
927
928=item #Morpho
929
930=back
931
Akronc13a1702016-03-15 19:33:14 +0100932=item B<TreeTagger>
933
934=over 4
935
Akronf7ad89e2016-03-16 18:22:47 +0100936=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100937
Akronf7ad89e2016-03-16 18:22:47 +0100938=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100939
940=back
941
942=item B<XIP>
943
944=over 4
945
Akronf7ad89e2016-03-16 18:22:47 +0100946=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100947
Akronf7ad89e2016-03-16 18:22:47 +0100948=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100949
Akronf7ad89e2016-03-16 18:22:47 +0100950=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100951
952=back
953
954=back
955
956More importers are in preparation.
957New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
958See the built-in annotation importers as examples.
959
Akron941c1a62016-02-23 17:41:41 +0100960=head1 AVAILABILITY
961
962 https://github.com/KorAP/KorAP-XML-Krill
963
964
965=head1 COPYRIGHT AND LICENSE
966
Akron3ec0a1c2017-01-18 14:41:55 +0100967Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100968
Akron941c1a62016-02-23 17:41:41 +0100969Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200970Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100971
972L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
973Corpus Analysis Platform at the
974L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
975member of the
976L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
977
978This program is free software published under the
979L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
980
981=cut