blob: 4539f5dab5aa0238ba0f2f8cfa6e6097d56d6026 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000010use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010011use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010012use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010013use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010014use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010015use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010016use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020017use KorAP::XML::Batch::File;
Akron941c1a62016-02-23 17:41:41 +010018use Parallel::ForkManager;
Akron75ba57d2016-03-07 23:36:27 +010019# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010020# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010021
Akron941c1a62016-02-23 17:41:41 +010022# CHANGES:
23# ----------------------------------------------------------
24# 2013/11/25
25# - Initial release
26#
27# 2014/10/29
28# - Merges foundry data to create indexer friendly documents
29#
Akron93d620e2016-02-05 19:40:05 +010030# 2016/02/04
31# - renamed to korapxml2krill
32# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010033#
34# 2016/02/12
35# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010036# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010037#
38# 2016/02/14
39# - Added version information
Akron941c1a62016-02-23 17:41:41 +010040# - Added support for archive files
41#
42# 2016/02/15
43# - Fixed temporary directory bug
44# - Improved skipping before unzipping
45# - Added EXPERIMENTAL concurrency support
46#
47# 2016/02/23
48# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010049#
50# 2016/02/27
51# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010052#
53# 2016/03/17
54# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010055#
56# 2016/03/18
57# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020058#
Akronf3f0c942016-06-27 13:27:14 +020059# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020060# - Added multi archive support
61# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020062# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020063#
64# 2016/07/06
65# - Added MDParser#Dependency
Akron941c1a62016-02-23 17:41:41 +010066# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +010067
# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2016/08/16';

# Directory containing this script (as reported by FindBin)
our $LOCAL = $FindBin::Bin;

# Banner used as the message of all usage and version outputs
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Parse command:
# A first argument that does not start with a dash is
# taken from the argument list and treated as the command
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +010080
# Options that can be set multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $token_base, $gzip, $primary, $pretty);

# Options with a default value
my $cache_file   = 'korapxml2krill.cache';
my $cache_size   = '50m';
my $cache_delete = 1;
my $cache_init   = 1;
my $log_level    = 'ERROR';
my $jobs         = 0;

# Parse options from the command line
GetOptions(
  # Input and output
  'input|i=s'        => \@input,
  'output|o=s'       => \$output,
  'overwrite|w'      => \$overwrite,
  'gzip|z'           => \$gzip,

  # Conversion
  'meta|m=s'         => \$meta,
  'token|t=s'        => \$token_base,
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'anno|a=s'         => \@anno,
  'primary|p!'       => \$primary,
  'pretty|y'         => \$pretty,

  # Caching
  'cache|c=s'        => \$cache_file,
  'cache-size|cs=s'  => \$cache_size,
  'cache-delete|cd!' => \$cache_delete,
  'cache-init|ci!'   => \$cache_init,

  # Processing
  'log|l=s'          => \$log_level,
  'jobs|j=i'         => \$jobs,

  # Print the usage sections of the documentation
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
119
# Parameters passed to pod2usage() whenever the
# script is called in an invalid way
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000133
# Configuration of the log4perl object:
# Everything from the requested level on is written
# to a colored screen appender named STDERR
my %log_conf = (
  'log4perl.rootLogger'              => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_conf);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
143
# Lookup table of everything to skip (keys are lowercased)
my %skip = map { ( lc($_), 1 ) } @skip;

# All annotation layers known to the script as [Foundry, Layer] pairs
my @layers = (
  # Base
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# All layers are skipped - only the annotations requested
# explicitly are taken, split into foundry and layer
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep the layer unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
215
# Get tokenization basis
# The value has the form "Foundry#Layer". Declaration and conditional
# assignment are kept apart, as the behaviour of a "my" combined
# with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Create the cache based on the cache options
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that receives all conversion
# related options and is used for every processed text
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
239
Akron941c1a62016-02-23 17:41:41 +0100240
# Get file name based on path information
#
# Expects a path and returns a flat name for it:
# A leading temporary directory fragment ("/tmp/<name>") and the
# first occurrence of the primary input path are removed, all
# remaining slashes are turned into dashes and leading dashes
# are stripped.
sub get_file_name ($) {
  my $file = shift;
  my $base = $input[0];

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the primary input path (with an optional leading slash).
  # The path is quoted, so characters like '.', '+' or '(' in the
  # input path are matched literally instead of being interpreted
  # as regular expression syntax.
  # NOTE(review): The former pattern started with '^?', an optional
  # anchor, so the match was effectively unanchored - this is kept.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
251
Akron941c1a62016-02-23 17:41:41 +0100252
253# Write file
Akrone1dbc382016-07-08 22:24:52 +0200254#sub write_file {
255# my $anno = shift;
256# my $file = get_file_name $anno;
257#
258# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
259#
260# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
261# $call .= ' -i ' . $anno;
262# $call .= ' -o ' . $output . '/' . $file . '.json';
263# $call .= '.gz -z' if $gzip;
264# $call .= ' -m ' . $meta if $meta;
265# $call .= ' -w' if $overwrite;
266# $call .= ' -t ' . $token_base if $token_base;
267# $call .= ' -l ' . $log_level if $log_level;
268# $call .= ' -c ' . $cache_file;
269# $call .= ' -cs ' . $cache_size;
270# $call .= ' --no-cache-delete'; # Don't delete the cache
271# $call .= ' --no-cache-init'; # Don't initialize the cache
272# $call .= ' --no-primary ' if $primary;
273# $call .= ' -y ' . $pretty if $pretty;
274# $call .= ' -a ' . $_ foreach @anno;
275# $call .= ' -s ' . $_ foreach @skip;
276# system($call);
277# return "$file";
278#};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000279
Nils Diewald2db9ad02013-10-29 19:26:43 +0000280
# Convert sigle to path construct:
# Surrounding whitespace is dropped and a sigle of the form
# "Corpus_Document.Text" is rewritten in place to "Corpus/Document/Text"
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
283
# Process a single file
unless ($cmd) {
  my $input = $input[0];

  # Remember the start time as soon as the script is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
    );
    $main::LAST_STOP = $new;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Create and parse new document
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}

# Extract XML files
elsif ($cmd eq 'extract') {

  # The output directory has to exist - this is an error,
  # so the script ends with a non-zero exit status
  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # TODO: Support sigles and full archives

  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        my ($prefix, $corpus, $doc, $text) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ";
      # TODO: Make this OS independent
      print '' . ($archive->extract('./' . $_, $output) ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # The extraction loop was completed - end with success
    exit(0);
  }
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
  };
}

# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # The output directory has to exist - this is an error,
  # so the script ends with a non-zero exit status
  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The process id and the exit code are passed first,
      # the reference to the message of the child is passed last
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # A child may end without passing a message
      my $report = ref($data) ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $report\n";
    }
  );

  # Start time of the processing
  my $t;
  print "Reading data ...\n";

#  unless (Cache::FastMmap->new(
#    share_file => $cache_file,
#    cache_size => $cache_size,
#    init_file => $cache_init
#  )) {
#    print "Unable to intialize cache '$cache_file'\n\n";
#    exit(1);
#  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Name of the output file in the output directory
      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      my $pid = $pool->start and next DIRECTORY_LOOP;
      my $msg;

      $msg = $batch_file->process($dirs[$i] => $filename);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Temporary directories are created per text below,
    # so make sure the module is loaded
    require File::Temp;

    # Add further annotation archives
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      # Name of the output file in the output directory
      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      my $pid = $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the target is the file name computed above
        # (as in directory processing), not the output directory
        $msg = $batch_file->process($dir => $filename);

        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $dirs[$i] . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";

  # The start time is only set in case processing was started
  print timestr(timediff(Benchmark->new, $t))."\n\n" if $t;
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000527
528__END__
Akron941c1a62016-02-23 17:41:41 +0100529
530=pod
531
532=encoding utf8
533
534=head1 NAME
535
Akronf7ad89e2016-03-16 18:22:47 +0100536korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100537
538
539=head1 SYNOPSIS
540
Akronc13a1702016-03-15 19:33:14 +0100541 $ korapxml2krill -z --input <directory> --output <filename>
542 $ korapxml2krill archive -z --input <directory> --output <directory>
  $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Akron941c1a62016-02-23 17:41:41 +0100544
545
546=head1 DESCRIPTION
547
548L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
549compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100550The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100551
552
553=head1 INSTALLATION
554
555The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
556
557 $ cpanm https://github.com/KorAP/KorAP-XML-Krill
558
Akronc13a1702016-03-15 19:33:14 +0100559In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100560be available on your command line immediately.
Akron941c1a62016-02-23 17:41:41 +0100561
562
563=head1 ARGUMENTS
564
565=over 2
566
567=item B<archive>
568
Akrone10ad322016-02-27 10:54:26 +0100569Process an archive as a Zip-file or a folder of KorAP-XML documents.
570
571=item B<extract>
572
573Extract KorAP-XML files from a Zip-file.
Akron941c1a62016-02-23 17:41:41 +0100574
575=back
576
577
578=head1 OPTIONS
579
580=over 2
581
Akron2cfe8092016-06-24 17:48:49 +0200582=item B<--input|-i> <directory|file|files>
Akron941c1a62016-02-23 17:41:41 +0100583
Akronf7ad89e2016-03-16 18:22:47 +0100584Directory or archive file of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100585
Akron0c3e3752016-06-28 15:55:53 +0200586Archiving supports multiple input archives with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200587that the first archive listed contains all primary data files
588and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200589
590 -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
591
Akron0c3e3752016-06-28 15:55:53 +0200592(The directory structure follows the base directory format,
593that may include a C<.> root folder.
594In this case further archives lacking a C<.> root folder
595need to be passed with a hash sign in front of the archive's name.)
Akron2cfe8092016-06-24 17:48:49 +0200596
Akron941c1a62016-02-23 17:41:41 +0100597=item B<--output|-o> <directory|file>
598
599Output folder for archive processing or
600document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100601writes to C<STDOUT> by default
602(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100603
604=item B<--overwrite|-w>
605
606Overwrite files that already exist.
607
608=item B<--token|-t> <foundry>[#<file>]
609
610Define the default tokenization by specifying
611the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100612of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100613
614=item B<--skip|-s> <foundry>[#<layer>]
615
Akronf7ad89e2016-03-16 18:22:47 +0100616Skip specific annotations by specifying the foundry
617(and optionally the layer with a C<#>-prefix),
618e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100619Can be set multiple times.
620
Akronc13a1702016-03-15 19:33:14 +0100621=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100622
Akronf7ad89e2016-03-16 18:22:47 +0100623Convert specific annotations by specifying the foundry
624(and optionally the layer with a C<#>-prefix),
625e.g. C<Mate> or C<Mate#Morpho>.
626Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100627
628=item B<--primary|-p>
629
Akronc13a1702016-03-15 19:33:14 +0100630Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100631Can be flagged using C<--no-primary> as well.
632This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100633
634=item B<--jobs|-j>
635
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100637for archive processing.
Akron11c80302016-03-18 19:44:43 +0100638Defaults to C<0> (everything runs in a single process).
Akronf7ad89e2016-03-16 18:22:47 +0100639This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100640
Akron35db6e32016-03-17 22:42:22 +0100641=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100642
Akron35db6e32016-03-17 22:42:22 +0100643Define the metadata parser to use. Defaults to C<I5>.
644Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
645This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100646
647=item B<--pretty|-y>
648
Akronc13a1702016-03-15 19:33:14 +0100649Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100650This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100651
652=item B<--gzip|-z>
653
Akronf7ad89e2016-03-16 18:22:47 +0100654Compress the output.
655Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100656
Akron11c80302016-03-18 19:44:43 +0100657=item B<--cache|-c>
658
659File to mmap a cache (using L<Cache::FastMmap>).
660Defaults to C<korapxml2krill.cache> in the calling directory.
661
662=item B<--cache-size|-cs>
663
664Size of the cache. Defaults to C<50m>.
665
666=item B<--cache-init|-ci>
667
668Initialize cache file.
669Can be flagged using C<--no-cache-init> as well.
670Defaults to C<true>.
671
672=item B<--cache-delete|-cd>
673
674Delete cache file after processing.
675Can be flagged using C<--no-cache-delete> as well.
676Defaults to C<true>.
677
Akrone10ad322016-02-27 10:54:26 +0100678=item B<--sigle|-sg>
679
680Extract the given text sigles.
Akrone10ad322016-02-27 10:54:26 +0100681Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +0100682I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +0200683Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akrone10ad322016-02-27 10:54:26 +0100684
Akron941c1a62016-02-23 17:41:41 +0100685=item B<--log|-l>
686
687The L<Log4perl> log level, defaults to C<ERROR>.
688
689=item B<--help|-h>
690
691Print this document.
692
693=item B<--version|-v>
694
695Print version information.
696
697=back
698
Akronc13a1702016-03-15 19:33:14 +0100699=head1 ANNOTATION SUPPORT
700
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
702developed in the KorAP project that are part of the KorAP preprocessing pipeline.
703The base foundry with paragraphs, sentences, and the text element are mandatory for
704L<Krill|https://github.com/KorAP/Krill>.
705
Akronf7ad89e2016-03-16 18:22:47 +0100706=over 2
Akronc13a1702016-03-15 19:33:14 +0100707
708=item B<Base>
709
710=over 4
711
Akronf7ad89e2016-03-16 18:22:47 +0100712=item #Paragraphs
Akronc13a1702016-03-15 19:33:14 +0100713
Akronf7ad89e2016-03-16 18:22:47 +0100714=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100715
716=back
717
718=item B<Connexor>
719
720=over 4
721
Akronf7ad89e2016-03-16 18:22:47 +0100722=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100723
Akronf7ad89e2016-03-16 18:22:47 +0100724=item #Phrase
Akronc13a1702016-03-15 19:33:14 +0100725
Akronf7ad89e2016-03-16 18:22:47 +0100726=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100727
Akronf7ad89e2016-03-16 18:22:47 +0100728=item #Syntax
Akronc13a1702016-03-15 19:33:14 +0100729
730=back
731
732=item B<CoreNLP>
733
734=over 4
735
Akronf7ad89e2016-03-16 18:22:47 +0100736=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100737
Akronf7ad89e2016-03-16 18:22:47 +0100738=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100739
Akronf7ad89e2016-03-16 18:22:47 +0100740=item #NamedEntities
Akronc13a1702016-03-15 19:33:14 +0100741
Akronf7ad89e2016-03-16 18:22:47 +0100742=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100743
744=back
745
746=item B<DeReKo>
747
748=over 4
749
Akronf7ad89e2016-03-16 18:22:47 +0100750=item #Structure
Akronc13a1702016-03-15 19:33:14 +0100751
752=back
753
754=item B<Glemm>
755
756=over 4
757
Akronf7ad89e2016-03-16 18:22:47 +0100758=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100759
760=back
761
762=item B<Mate>
763
764=over 4
765
Akronf7ad89e2016-03-16 18:22:47 +0100766=item #Dependency
Akronc13a1702016-03-15 19:33:14 +0100767
Akronf7ad89e2016-03-16 18:22:47 +0100768=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100769
770=back
771
772=item B<OpenNLP>
773
774=over 4
775
Akronf7ad89e2016-03-16 18:22:47 +0100776=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100777
Akronf7ad89e2016-03-16 18:22:47 +0100778=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100779
780=back
781
782=item B<Sgbr>
783
784=over 4
785
Akronf7ad89e2016-03-16 18:22:47 +0100786=item #Lemma
Akronc13a1702016-03-15 19:33:14 +0100787
Akronf7ad89e2016-03-16 18:22:47 +0100788=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100789
790=back
791
792=item B<TreeTagger>
793
794=over 4
795
Akronf7ad89e2016-03-16 18:22:47 +0100796=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100797
Akronf7ad89e2016-03-16 18:22:47 +0100798=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100799
800=back
801
802=item B<XIP>
803
804=over 4
805
Akronf7ad89e2016-03-16 18:22:47 +0100806=item #Constituency
Akronc13a1702016-03-15 19:33:14 +0100807
Akronf7ad89e2016-03-16 18:22:47 +0100808=item #Morpho
Akronc13a1702016-03-15 19:33:14 +0100809
Akronf7ad89e2016-03-16 18:22:47 +0100810=item #Sentences
Akronc13a1702016-03-15 19:33:14 +0100811
812=back
813
814=back
815
816More importers are in preparation.
817New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
818See the built-in annotation importers as examples.
819
Akron941c1a62016-02-23 17:41:41 +0100820=head1 AVAILABILITY
821
822 https://github.com/KorAP/KorAP-XML-Krill
823
824
825=head1 COPYRIGHT AND LICENSE
826
827Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +0100828
Akron941c1a62016-02-23 17:41:41 +0100829Author: L<Nils Diewald|http://nils-diewald.de/>
830
831L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
832Corpus Analysis Platform at the
833L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
834member of the
835L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
836
837This program is free software published under the
838L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
839
840=cut