blob: fab6147be4ee686c8ea4b0c0c71f51655cc0ef88 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020066#
67# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020068# - Fixed temporary path issue in script
69#
70# 2016/10/24
71# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020072#
Akronb4bbec72016-10-26 20:21:02 +020073# 2016/10/24
74# - Added support for document extraction
75#
Akron3741f8b2016-12-21 19:55:21 +010076# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020077# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020078#
Akron3741f8b2016-12-21 19:55:21 +010079# 2016/12/21
80# - added support for base-sentences and base-tokenizations
81#
Akron4fa37c32017-01-20 14:43:10 +010082# 2017/01/20
83# - added support for DRuKoLa annotations
84#
Akron41ac10b2017-02-08 22:47:25 +010085# 2017/02/08
86# - added support for pagebreak annotations
87#
Akron941c1a62016-02-23 17:41:41 +010088# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010089
# Date of the most recent entry in the change log
our $LAST_CHANGE = '2017/02/08';

# Directory of the running script, as resolved by FindBin
our $LOCAL = $FindBin::Bin;

# Banner used as message for the usage and version output:
# library version, contact address and date of the last change
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
95
# Parse command
# An optional command may precede the options: the first argument
# is taken as the command unless it starts with a dash.
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with default values
my $token_base      = 'OpenNLP#tokens';
my $base_sentences  = '';
my $base_paragraphs = '';
my $base_pagebreaks = '';
my $cache_file      = 'korapxml2krill.cache';
my $log_level       = 'ERROR';
my $jobs            = 0;
my $cache_size      = '50m';
my $cache_delete    = 1;
my $cache_init      = 1;

# Handler for --help: print the listed POD sections
# together with the version banner
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: print the version banner
# using the lowest pod2usage verbosity
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  )
};

# Parse options from the command line
GetOptions(
  'input|i=s'             => \@input,
  'output|o=s'            => \$output,
  'overwrite|w'           => \$overwrite,
  'meta|m=s'              => \$meta,
  'token|t=s'             => \$token_base,
  'base-sentences|bs=s'   => \$base_sentences,
  'base-paragraphs|bp=s'  => \$base_paragraphs,
  'base-pagebreaks|bpb=s' => \$base_pagebreaks,
  'gzip|z'                => \$gzip,
  'skip|s=s'              => \@skip,
  'sigle|sg=s'            => \@sigle,
  'cache|c=s'             => \$cache_file,
  'log|l=s'               => \$log_level,
  'anno|a=s'              => \@anno,
  'primary|p!'            => \$primary,
  'pretty|y'              => \$pretty,
  'jobs|j=i'              => \$jobs,
  'cache-size|cs=s'       => \$cache_size,
  'cache-delete|cd!'      => \$cache_delete,
  'cache-init|ci!'        => \$cache_init,
  'help|h'                => $print_help,
  'version|v'             => $print_version
);
144
# Normalize the base layer options, as they are compared against
# lowercase values (like 'dereko#structure') when the layers are
# collected. This includes the pagebreak layer, so that the documented
# spelling "DeReKo#Structure" is recognized for all three options.
$base_sentences  = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
$base_pagebreaks = lc $base_pagebreaks;

# Parameters for pod2usage in case of an invalid call:
# print the usage sections with the version banner and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000161
# Initialize log4perl object
# The root logger uses the (uppercased) level passed by --log and
# a single appender named STDERR with a pattern layout.
my %log_config = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
171
# Lookup table of all (lowercased) values passed by --skip
my %skip = map { (lc($_) => 1) } @skip;

# List of all supported annotation layers. Each entry is an array
# reference of the form [Foundry, Layer] (with an optional third element).
my @layers;

# The base layers are only added, if no alternative layer is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# DeReKo
# Collect all base information that is requested from DeReKo#Structure
my @dereko_attr = ();
foreach my $base (
  [sentences  => $base_sentences],
  [paragraphs => $base_paragraphs],
  [pagebreaks => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# The requested base information is passed as a third element
# of the form 'base-sentences-paragraphs-pagebreaks'
push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# All further foundries with their layers, in processing order
foreach my $foundry (
  [Glemm      => qw/Morpho/],
  [Malt       => qw/Dependency/],
  [MDParser   => qw/Dependency/],
  [Mate       => qw/Morpho Dependency/],
  [OpenNLP    => qw/Morpho Sentences/],
  [Sgbr       => qw/Lemma Morpho/],          # Schreibgebrauch
  [TreeTagger => qw/Morpho Sentences/],
  [XIP        => qw/Morpho Constituency Sentences Dependency/],
  [DRuKoLa    => qw/Morpho/]
) {
  my ($name, @layer_names) = @$foundry;
  push @layers, map { [$name, $_] } @layer_names;
};

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations
# passed by --anno are taken into account
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Keep a layer unless its Foundry or its Foundry#Layer should be skipped
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);
    !$skip{$foundry} && !$skip{$foundry . '#' . $layer};
  } @layers;
};
264
# Get tokenization basis
# The value has the form 'Foundry#Layer'. Both variables stay undefined,
# if no token base is given. The declaration is unconditional, because the
# behaviour of a "my" combined with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer) =
  $token_base ? split(/#/, $token_base) : ();

# Cache that is handed over to the batch object
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object
# It receives all conversion settings and is used
# to process every single document
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
288
Akron941c1a62016-02-23 17:41:41 +0100289
# Get file name based on path information
#
# Expects the path of a text (or of its output file) and returns a flat
# file name, in which the path separators are replaced by dashes.
# The first input path of the script is removed from the given path.
sub get_file_name ($) {
  my $file = shift;

  # Base path to remove - for directories without a trailing slash,
  # the last path segment is stripped, so it stays part of the file name
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path (with an optional
  # leading slash). The path is quoted, as it is a literal string and
  # may contain characters with a special meaning in regular expressions.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path and remove leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # In case more than three dash separated segments are left,
  # the first segment is removed
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
306
# Convert sigle to path construct
# (a sigle of the form 'Corpus_Document.Text' becomes 'Corpus/Document/Text',
# surrounding whitespace is removed)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands expect the output to be an existing directory
# NOTE(review): the script ends with exit status 0 here, although
# this is an error condition - confirm that this is intended
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(0);
};
316
317
# Process a single file
unless ($cmd) {

  # Take the start times at compile time already
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop
  # and since the start of the script
  sub stop_time {
    my $now         = Benchmark->new;
    my $since_stop  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_stop . ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # The path of the document has to end with a slash
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Create, parse and convert the document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000348
# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object based on the first (primary) input file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  if ($archive) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    foreach my $further (@input[1..$#input]) {
      $archive->attach($further);
    };

    my $prefix = 1;

    # Check sigle for doc sigles
    if (@sigle) {
      my @text_sigle;
      my $prefix_checked = 0;

      # Iterate over all sigle
      foreach my $sigle (@sigle) {

        # Sigle is not a doc sigle - the text is extracted later
        unless ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
          push @text_sigle, $sigle;
          next;
        };

        print "$sigle ...\n";

        # Check if a prefix is needed (only once)
        unless ($prefix_checked) {
          $prefix = $archive->check_prefix;
          $prefix_checked = 1;
        };

        # TODO: Make this OS independent
        my $path = ($prefix ? './' : '') . $sigle;

        # Extract the whole document and report the result
        my $extracted = $archive->extract_doc($path, $output, $jobs);
        print '... ' . ($extracted ? '' : 'not ');
        print "extracted.\n";
      };

      # Only the text sigles are left for extraction
      @sigle = @text_sigle;
    }

    # No sigles given - all texts of the archive are extracted
    else {

      # Get files
      foreach my $path ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join('/', $corpus, $doc, $text);
      };
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {
      print "$sigle ...\n";

      # TODO: Make this OS independent
      my $extracted = $archive->extract_text(
        ($prefix ? './' : '') . $sigle, $output
      );
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # NOTE(review): the script ends with exit status 1 after the
    # extraction was processed - confirm that this is intended
    exit(1);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
  };
}
438
# Process an archive
# The input is either a directory containing multiple texts or
# one or more zip archives. Each text is converted to a separate
# file in the output directory.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The last parameter is the data structure passed to finish():
      # a message and (for archives) the temporary directory
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  # Start time of the processing - only set, if the input is valid
  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to initialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories that contain a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Name of the output file
      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      # Convert the text - a return value of -1 is reported
      # as an already existing file
      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      # Name of the output file
      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory for the extracted text
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exists at the beginning,
      # because extraction can be horribly slow!

      # Extract from archive
      if ($archive->extract_text($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed along with
        # the message, so it can be released on finish
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # The start time is only defined, if the input was processed -
  # timediff() must not be called with an undefined value
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000607
608__END__
Akron941c1a62016-02-23 17:41:41 +0100609
610=pod
611
612=encoding utf8
613
614=head1 NAME
615
Akronf7ad89e2016-03-16 18:22:47 +0100616korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100617
618
619=head1 SYNOPSIS
620
Akrona76d8352016-10-27 16:27:32 +0200621 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100622
Akron2fd402b2016-10-27 21:26:48 +0200623
Akron941c1a62016-02-23 17:41:41 +0100624=head1 DESCRIPTION
625
626L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
627compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100628The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100629
630
631=head1 INSTALLATION
632
633The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
634
Akronaf386982016-10-12 00:33:25 +0200635 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100636
Akronc13a1702016-03-15 19:33:14 +0100637In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100638be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200639Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100641
642=head1 ARGUMENTS
643
Akrona76d8352016-10-27 16:27:32 +0200644 $ korapxml2krill -z --input <directory> --output <filename>
645
646Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200647It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200648
Akron941c1a62016-02-23 17:41:41 +0100649=over 2
650
651=item B<archive>
652
Akrona76d8352016-10-27 16:27:32 +0200653 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
654
Akron2fd402b2016-10-27 21:26:48 +0200655Converts an archive of KorAP-XML documents. It expects a directory
656(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100657
658=item B<extract>
659
Akrona76d8352016-10-27 16:27:32 +0200660 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
661
662Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100663
664=back
665
666
667=head1 OPTIONS
668
669=over 2
670
Akrona76d8352016-10-27 16:27:32 +0200671=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100672
Akrona76d8352016-10-27 16:27:32 +0200673Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100674
Akron7606afa2016-10-25 16:23:49 +0200675Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100676document, while C<archive> expects a KorAP-XML corpus folder or a zip
677file to batch process multiple files.
678C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200679
Akrona76d8352016-10-27 16:27:32 +0200680C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200681that the first archive listed contains all primary data files
682and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200683
Akron7606afa2016-10-25 16:23:49 +0200684 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200685
Akron0c3e3752016-06-28 15:55:53 +0200686(The directory structure follows the base directory format,
687that may include a C<.> root folder.
688In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200689need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200691
Akron7606afa2016-10-25 16:23:49 +0200692To support zip files, a version of C<unzip> needs to be installed that is
693compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200694
Akron7606afa2016-10-25 16:23:49 +0200695B<The root folder switch using the hash sign is experimental and
696may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200697
Akron941c1a62016-02-23 17:41:41 +0100698=item B<--output|-o> <directory|file>
699
700Output folder for archive processing or
701document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100702writes to C<STDOUT> by default
703(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100704
705=item B<--overwrite|-w>
706
707Overwrite files that already exist.
708
Akron3741f8b2016-12-21 19:55:21 +0100709=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100710
711Define the default tokenization by specifying
712the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100713of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100714
Akron3741f8b2016-12-21 19:55:21 +0100715
716=item B<--base-sentences|-bs> <foundry>#<layer>
717
718Define the layer for base sentences.
719If given, this will be used instead of using C<Base#Sentences>.
720Currently C<DeReKo#Structure> is the only additional layer supported.
721
722 Defaults to unset.
723
724
725=item B<--base-paragraphs|-bp> <foundry>#<layer>
726
727Define the layer for base paragraphs.
728If given, this will be used instead of using C<Base#Paragraphs>.
729Currently C<DeReKo#Structure> is the only additional layer supported.
730
731 Defaults to unset.
732
733
Akron41ac10b2017-02-08 22:47:25 +0100734=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
735
736Define the layer for base pagebreaks.
737Currently C<DeReKo#Structure> is the only layer supported.
738
739 Defaults to unset.
740
741
Akron941c1a62016-02-23 17:41:41 +0100742=item B<--skip|-s> <foundry>[#<layer>]
743
Akronf7ad89e2016-03-16 18:22:47 +0100744Skip specific annotations by specifying the foundry
745(and optionally the layer with a C<#>-prefix),
746e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100747Can be set multiple times.
748
Akronc13a1702016-03-15 19:33:14 +0100749=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100750
Akronf7ad89e2016-03-16 18:22:47 +0100751Convert specific annotations by specifying the foundry
752(and optionally the layer with a C<#>-prefix),
753e.g. C<Mate> or C<Mate#Morpho>.
754Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100755
756=item B<--primary|-p>
757
Akronc13a1702016-03-15 19:33:14 +0100758Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100759Can be flagged using C<--no-primary> as well.
760This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100761
762=item B<--jobs|-j>
763
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100765for archive processing.
Akron11c80302016-03-18 19:44:43 +0100766Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100767This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100768
Akron35db6e32016-03-17 22:42:22 +0100769=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100770
Akron35db6e32016-03-17 22:42:22 +0100771Define the metadata parser to use. Defaults to C<I5>.
772Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
773This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100774
775=item B<--pretty|-y>
776
Akronc13a1702016-03-15 19:33:14 +0100777Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100778This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100779
780=item B<--gzip|-z>
781
Akronf7ad89e2016-03-16 18:22:47 +0100782Compress the output.
783Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100784
Akron11c80302016-03-18 19:44:43 +0100785=item B<--cache|-c>
786
787File to mmap a cache (using L<Cache::FastMmap>).
788Defaults to C<korapxml2krill.cache> in the calling directory.
789
790=item B<--cache-size|-cs>
791
792Size of the cache. Defaults to C<50m>.
793
794=item B<--cache-init|-ci>
795
796Initialize cache file.
797Can be flagged using C<--no-cache-init> as well.
798Defaults to C<true>.
799
800=item B<--cache-delete|-cd>
801
802Delete cache file after processing.
803Can be flagged using C<--no-cache-delete> as well.
804Defaults to C<true>.
805
Akrone10ad322016-02-27 10:54:26 +0100806=item B<--sigle|-sg>
807
Akron20807582016-10-26 17:11:34 +0200808Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +0100809Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100810I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200811Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +0200812In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +0200813On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +0100814
Akron941c1a62016-02-23 17:41:41 +0100815=item B<--log|-l>
816
817The L<Log::Log4perl> log level. Defaults to C<ERROR>.
818
819=item B<--help|-h>
820
821Print this document.
822
823=item B<--version|-v>
824
825Print version information.
826
827=back
828
Akronc13a1702016-03-15 19:33:14 +0100829=head1 ANNOTATION SUPPORT
830
831L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
832developed in the KorAP project that are part of the KorAP preprocessing pipeline.
833The base foundry with paragraphs, sentences, and the text element is mandatory for
834L<Krill|https://github.com/KorAP/Krill>.
835
Akronf7ad89e2016-03-16 18:22:47 +0100836=over 2
Akronc13a1702016-03-15 19:33:14 +0100837
838=item B<Base>
839
840=over 4
841
Akronf7ad89e2016-03-16 18:22:47 +0100842=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100843
Akronf7ad89e2016-03-16 18:22:47 +0100844=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100845
846=back
847
848=item B<Connexor>
849
850=over 4
851
Akronf7ad89e2016-03-16 18:22:47 +0100852=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100853
Akronf7ad89e2016-03-16 18:22:47 +0100854=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100855
Akronf7ad89e2016-03-16 18:22:47 +0100856=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100857
Akronf7ad89e2016-03-16 18:22:47 +0100858=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100859
860=back
861
862=item B<CoreNLP>
863
864=over 4
865
Akronf7ad89e2016-03-16 18:22:47 +0100866=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100867
Akronf7ad89e2016-03-16 18:22:47 +0100868=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100869
Akronf7ad89e2016-03-16 18:22:47 +0100870=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100871
Akronf7ad89e2016-03-16 18:22:47 +0100872=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100873
874=back
875
876=item B<DeReKo>
877
878=over 4
879
Akronf7ad89e2016-03-16 18:22:47 +0100880=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100881
882=back
883
884=item B<Glemm>
885
886=over 4
887
Akronf7ad89e2016-03-16 18:22:47 +0100888=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100889
890=back
891
892=item B<Mate>
893
894=over 4
895
Akronf7ad89e2016-03-16 18:22:47 +0100896=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100897
Akronf7ad89e2016-03-16 18:22:47 +0100898=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100899
900=back
901
902=item B<OpenNLP>
903
904=over 4
905
Akronf7ad89e2016-03-16 18:22:47 +0100906=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100907
Akronf7ad89e2016-03-16 18:22:47 +0100908=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100909
910=back
911
912=item B<Sgbr>
913
914=over 4
915
Akronf7ad89e2016-03-16 18:22:47 +0100916=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100917
Akronf7ad89e2016-03-16 18:22:47 +0100918=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100919
920=back
921
Akron4fa37c32017-01-20 14:43:10 +0100922=item B<DRuKoLa>
923
924=over 4
925
926=item #Morpho
927
928=back
929
Akronc13a1702016-03-15 19:33:14 +0100930=item B<TreeTagger>
931
932=over 4
933
Akronf7ad89e2016-03-16 18:22:47 +0100934=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100935
Akronf7ad89e2016-03-16 18:22:47 +0100936=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100937
938=back
939
940=item B<XIP>
941
942=over 4
943
Akronf7ad89e2016-03-16 18:22:47 +0100944=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100945
Akronf7ad89e2016-03-16 18:22:47 +0100946=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100947
Akronf7ad89e2016-03-16 18:22:47 +0100948=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100949
950=back
951
952=back
953
954More importers are in preparation.
955New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
956See the built-in annotation importers as examples.
957
Akron941c1a62016-02-23 17:41:41 +0100958=head1 AVAILABILITY
959
960 https://github.com/KorAP/KorAP-XML-Krill
961
962
963=head1 COPYRIGHT AND LICENSE
964
Akron3ec0a1c2017-01-18 14:41:55 +0100965Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100966
Akron941c1a62016-02-23 17:41:41 +0100967Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +0200968Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +0100969
970L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
971Corpus Analysis Platform at the
972L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
973member of the
974L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
975
976This program is free software published under the
977L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
978
979=cut