blob: 65bc89a134a2dedd086f5b872ecc160444b2460c [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
# Release information used in the usage and version messages
our $LAST_CHANGE = '2016/08/16';

# Directory the script was started from
our $LOCAL = $FindBin::Bin;

# Single line message: library version, contact and date of last change
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command:
# The first argument is treated as the command
# unless it starts with a dash (i.e. it is an option)
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /\A-/;
Akron93d620e2016-02-05 19:40:05 +010080
# Options that can be set multiple times are collected in arrays
my (@skip, @sigle, @anno, @input);
my $text;

# Single valued options without a default value
my ($output, $overwrite, $meta, $token_base, $gzip, $primary, $pretty);

# Single valued options with a default value
my $cache_file   = 'korapxml2krill.cache';
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;
my $log_level    = 'ERROR';
my $jobs         = 0;

# Callback for --help: hand the main POD sections to pod2usage
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Callback for --version: hand the version message to pod2usage
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'gzip|z'           => \$gzip,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \$cache_file,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,
  'log|l=s'          => \$log_level,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,
  'jobs|j=i'         => \$jobs,
  'help|h'           => $show_help,
  'version|v'        => $show_version
);
119
# Arguments passed to pod2usage whenever the invocation is invalid;
# -exit => 1 requests a non-zero exit status
my %ERROR_HASH = (
  -exit     => 1,
  -output   => '-',
  -msg      => $VERSION_MSG,
  -verbose  => 99,
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
}

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
}

# Initialize log4perl object:
# The requested level is upper cased and everything is
# appended to a colored screen appender named STDERR
my %log_conf = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
143
# Lookup table of everything passed with --skip (lower cased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# All supported annotation layers as [Foundry, Layer] pairs.
# The order of this list is the order of the default annotations.
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations passed
# with --anno are taken, split into foundry and layer
if ($skip{'#all'}) {
  @filtered_anno = map {; [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  for my $layer (@layers) {
    my ($foundry, $name) = map { lc($_) } @$layer;

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $name};

    push @filtered_anno, $layer;
  }
};
215
# Get tokenization basis:
# --token has the form <foundry>[#<layer>].
# The variables are declared unconditionally, because the behaviour
# of "my ... if ..." (a my with a statement modifier) is undefined
# in Perl. Both stay undefined if --token was not passed.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# TODO: This should not be initialized for batch
# Cache backed by the file given with --cache
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file => $cache_init
);

# Create batch object, that receives all conversion settings
# and is used by the single file and the archive processing
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
239
Akron941c1a62016-02-23 17:41:41 +0100240
241# Get file name based on path information
# Get file name based on path information.
#
# Expects a path to a text and returns a flat name for it:
# a leading temporary directory fragment ("/tmp/<name>") and the
# first occurrence of the first --input value (optionally preceded
# by a slash) are removed, all remaining slashes are turned into
# dashes and leading dashes are stripped.
#
# The prototype is kept, so existing calls parse as before.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is quoted, so characters like
  # '.', '+' or '(' in it are matched literally instead of being
  # interpreted as regular expression syntax.
  # The match is intentionally not anchored: the former pattern
  # started with '^?', an optional anchor, which has no effect.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
251
Akron941c1a62016-02-23 17:41:41 +0100252
253# Write file
Akrone1dbc382016-07-08 22:24:52 +0200254#sub write_file {
255# my $anno = shift;
256# my $file = get_file_name $anno;
257#
258# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
259#
260# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
261# $call .= ' -i ' . $anno;
262# $call .= ' -o ' . $output . '/' . $file . '.json';
263# $call .= '.gz -z' if $gzip;
264# $call .= ' -m ' . $meta if $meta;
265# $call .= ' -w' if $overwrite;
266# $call .= ' -t ' . $token_base if $token_base;
267# $call .= ' -l ' . $log_level if $log_level;
268# $call .= ' -c ' . $cache_file;
269# $call .= ' -cs ' . $cache_size;
270# $call .= ' --no-cache-delete'; # Don't delete the cache
271# $call .= ' --no-cache-init'; # Don't initialize the cache
272# $call .= ' --no-primary ' if $primary;
273# $call .= ' -y ' . $pretty if $pretty;
274# $call .= ' -a ' . $_ foreach @anno;
275# $call .= ' -s ' . $_ foreach @skip;
276# system($call);
277# return "$file";
278#};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000279
Nils Diewald2db9ad02013-10-29 19:26:43 +0000280
# Convert sigle to path construct:
# A sigle of the form Corpus_Document.Text is rewritten in place
# to Corpus/Document/Text, surrounding whitespace is removed.
# Values not matching that form are left untouched.
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}
283
Akron941c1a62016-02-23 17:41:41 +0100284# Process a single file
unless ($cmd) {

  # Both benchmark marks are taken in a BEGIN block,
  # i.e. as soon as this part of the script is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call (or since the start)
  # together with the overall time, then reset the last mark
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Only the first input is respected in single processing;
  # make sure the path ends with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald59094f22014-11-05 18:20:50 +0000313
Akrone10ad322016-02-27 10:54:26 +0100314# Extract XML files
elsif ($cmd eq 'extract') {

  # A given output has to be an existing directory
  # NOTE(review): this error path exits with 0 and the successful
  # path below exits with 1 - looks inverted, confirm before changing
  if ($output && !(-e $output && -d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # TODO: Support sigles and full archives
  my $archive;
  if (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {

    if (!$archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    for my $file (@input) {
      $archive->attach($file);
    };

    # Paths are prefixed with './' by default; the value is
    # replaced by the prefix information of the listed texts
    my $prefix = 1;

    # No sigles given - take all texts of the archive
    if (!@sigle) {

      # Get files
      for my $path ($archive->list_texts) {

        # Split path information
        my ($corpus, $doc, $text);
        ($prefix, $corpus, $doc, $text) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join('/', $corpus, $doc, $text);
      };
    };

    # Iterate over all given sigles and extract
    for my $sigle (@sigle) {
      print "$sigle ";

      # TODO: Make this OS independent
      my $path   = ($prefix ? './' : '') . $sigle;
      my $status = $archive->extract($path, $output) ? '' : 'not ';
      print $status . "extracted.\n";
    };

    print "\n";
    exit(1);
  }
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
  };
}
368
Akron941c1a62016-02-23 17:41:41 +0100369# Process an archive
elsif ($cmd eq 'archive') {

  # A given output has to be an existing directory
  # NOTE(review): this error path exits with 0 - confirm whether
  # callers rely on that before changing it to a failure code
  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish(
    sub {
      # The callback receives the process id and the exit code first
      # and the data passed to finish() as the last argument
      my ($pid, $code) = @_;
      my $data = pop;

      # The data reference may be missing, if a job ended
      # without passing a message
      my $msg = (ref $data && defined $$data) ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # and do not descend any further into them
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = $batch_file->process($dirs[$i] => $filename);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input;

    # Temporary directories are created per text below;
    # make sure the module is loaded, as this file does not use it
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory to extract the text to
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - to the file name of the text,
        # the same way the directory processing does
        $msg = $batch_file->process($dir => $filename);

        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $dirs[$i] . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # The timer is only started, if there was something to process
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000532
533__END__
Akron941c1a62016-02-23 17:41:41 +0100534
535=pod
536
537=encoding utf8
538
539=head1 NAME
540
Akronf7ad89e2016-03-16 18:22:47 +0100541korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100542
543
544=head1 SYNOPSIS
545
Akronc13a1702016-03-15 19:33:14 +0100546 $ korapxml2krill -z --input <directory> --output <filename>
547 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100549
550
551=head1 DESCRIPTION
552
553L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
554compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100555The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100556
557
558=head1 INSTALLATION
559
560The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
561
562 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
563
Akronc13a1702016-03-15 19:33:14 +0100564In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100565be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100566
567
568=head1 ARGUMENTS
569
570=over 2
571
572=item B<archive>
573
Akrone10ad322016-02-27 10:54:26 +0100574Process an archive as a Zip-file or a folder of KorAP-XML documents.
575
576=item B<extract>
577
578Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100579
580=back
581
582
583=head1 OPTIONS
584
585=over 2
586
Akron2cfe8092016-06-24 17:48:49 +0200587=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100588
Akronf7ad89e2016-03-16 18:22:47 +0100589Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100590
Akron0c3e3752016-06-28 15:55:53 +0200591Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200592that the first archive listed contains all primary data files
593and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200594
595 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
596
Akron0c3e3752016-06-28 15:55:53 +0200597(The directory structure follows the base directory format,
598that may include a C<.> root folder.
599In this case further archives lacking a C<.> root folder
600need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200601
Akron651cb8d2016-08-16 21:44:49 +0200602B<The root folder switch is experimental and may vanish in future versions.>
603
Akron941c1a62016-02-23 17:41:41 +0100604=item B<--output|-o> <directory|file>
605
606Output folder for archive processing or
607document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100608writes to C<STDOUT> by default
609(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100610
611=item B<--overwrite|-w>
612
613Overwrite files that already exist.
614
615=item B<--token|-t> <foundry>[#<file>]
616
617Define the default tokenization by specifying
618the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100619of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100620
621=item B<--skip|-s> <foundry>[#<layer>]
622
Akronf7ad89e2016-03-16 18:22:47 +0100623Skip specific annotations by specifying the foundry
624(and optionally the layer with a C<#>-prefix),
625e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100626Can be set multiple times.
627
Akronc13a1702016-03-15 19:33:14 +0100628=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100629
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
Only takes effect if the default annotations are skipped with C<--skip #ALL>.
Akron941c1a62016-02-23 17:41:41 +0100634
635=item B<--primary|-p>
636
Akronc13a1702016-03-15 19:33:14 +0100637Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100638Can be flagged using C<--no-primary> as well.
639This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100640
641=item B<--jobs|-j>
642
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100644for archive processing.
Akron11c80302016-03-18 19:44:43 +0100645Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100646This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100647
Akron35db6e32016-03-17 22:42:22 +0100648=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100649
Akron35db6e32016-03-17 22:42:22 +0100650Define the metadata parser to use. Defaults to C<I5>.
651Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
652This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100653
654=item B<--pretty|-y>
655
Akronc13a1702016-03-15 19:33:14 +0100656Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100657This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100658
659=item B<--gzip|-z>
660
Akronf7ad89e2016-03-16 18:22:47 +0100661Compress the output.
662Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100663
Akron11c80302016-03-18 19:44:43 +0100664=item B<--cache|-c>
665
666File to mmap a cache (using L<Cache::FastMmap>).
667Defaults to C<korapxml2krill.cache> in the calling directory.
668
669=item B<--cache-size|-cs>
670
671Size of the cache. Defaults to C<50m>.
672
673=item B<--cache-init|-ci>
674
675Initialize cache file.
676Can be flagged using C<--no-cache-init> as well.
677Defaults to C<true>.
678
679=item B<--cache-delete|-cd>
680
681Delete cache file after processing.
682Can be flagged using C<--no-cache-delete> as well.
683Defaults to C<true>.
684
Akrone10ad322016-02-27 10:54:26 +0100685=item B<--sigle|-sg>
686
687Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100688Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100689I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200690Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100691
Akron941c1a62016-02-23 17:41:41 +0100692=item B<--log|-l>
693
694The L<Log4perl> log level, defaults to C<ERROR>.
695
696=item B<--help|-h>
697
698Print this document.
699
700=item B<--version|-v>
701
702Print version information.
703
704=back
705
Akronc13a1702016-03-15 19:33:14 +0100706=head1 ANNOTATION SUPPORT
707
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
712
Akronf7ad89e2016-03-16 18:22:47 +0100713=over 2
Akronc13a1702016-03-15 19:33:14 +0100714
715=item B<Base>
716
717=over 4
718
Akronf7ad89e2016-03-16 18:22:47 +0100719=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100720
Akronf7ad89e2016-03-16 18:22:47 +0100721=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100722
723=back
724
725=item B<Connexor>
726
727=over 4
728
Akronf7ad89e2016-03-16 18:22:47 +0100729=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100730
Akronf7ad89e2016-03-16 18:22:47 +0100731=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100732
Akronf7ad89e2016-03-16 18:22:47 +0100733=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100734
Akronf7ad89e2016-03-16 18:22:47 +0100735=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100736
737=back
738
739=item B<CoreNLP>
740
741=over 4
742
Akronf7ad89e2016-03-16 18:22:47 +0100743=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100744
Akronf7ad89e2016-03-16 18:22:47 +0100745=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100746
Akronf7ad89e2016-03-16 18:22:47 +0100747=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100748
Akronf7ad89e2016-03-16 18:22:47 +0100749=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100750
751=back
752
753=item B<DeReKo>
754
755=over 4
756
Akronf7ad89e2016-03-16 18:22:47 +0100757=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100758
759=back
760
761=item B<Glemm>
762
763=over 4
764
Akronf7ad89e2016-03-16 18:22:47 +0100765=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100766
767=back
768
769=item B<Mate>
770
771=over 4
772
Akronf7ad89e2016-03-16 18:22:47 +0100773=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100774
Akronf7ad89e2016-03-16 18:22:47 +0100775=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100776
777=back
778
779=item B<OpenNLP>
780
781=over 4
782
Akronf7ad89e2016-03-16 18:22:47 +0100783=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100784
Akronf7ad89e2016-03-16 18:22:47 +0100785=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100786
787=back
788
789=item B<Sgbr>
790
791=over 4
792
Akronf7ad89e2016-03-16 18:22:47 +0100793=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100794
Akronf7ad89e2016-03-16 18:22:47 +0100795=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100796
797=back
798
799=item B<TreeTagger>
800
801=over 4
802
Akronf7ad89e2016-03-16 18:22:47 +0100803=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100804
Akronf7ad89e2016-03-16 18:22:47 +0100805=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100806
807=back
808
809=item B<XIP>
810
811=over 4
812
Akronf7ad89e2016-03-16 18:22:47 +0100813=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100814
Akronf7ad89e2016-03-16 18:22:47 +0100815=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100816
Akronf7ad89e2016-03-16 18:22:47 +0100817=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100818
819=back
820
821=back
822
823More importers are in preparation.
824New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
825See the built-in annotation importers as examples.
826
Akron941c1a62016-02-23 17:41:41 +0100827=head1 AVAILABILITY
828
829 https://github.com/KorAP/KorAP-XML-Krill
830
831
832=head1 COPYRIGHT AND LICENSE
833
834Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100835
Akron941c1a62016-02-23 17:41:41 +0100836Author: L<Nils Diewald|http://nils-diewald.de/>
837
838L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
839Corpus Analysis Platform at the
840L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
841member of the
842L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
843
844This program is free software published under the
845L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
846
847=cut