blob: 03a808893243dba53cb01ebfe1a81797e7a59eca [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
use Pod::Usage;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use Parallel::ForkManager;
# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
# ----------------------------------------------------------
# Date of the last change, shown in the version banner
our $LAST_CHANGE = '2016/02/23';

# Directory this script lives in
our $LOCAL = $FindBin::Bin;

# Banner printed together with the usage and version information
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";

# Parse command: a leading argument that does not start
# with a dash is taken as the command name
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
# Parse options from the command line.
#
# List-valued options are bound to explicitly declared arrays:
# `\(my @list)` falls under the `\(@list)` special case of the
# backslash operator and yields references to the array's elements
# (none for a fresh array) instead of a reference to the array itself,
# so Getopt::Long would not receive a destination for the option and
# the lexical array would stay empty.
my (@skip, @allow);

# Options with a default value
my $log_level = 'ERROR';
my $jobs      = 0;

GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'human|m'     => \(my $text),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  # Negatable: stays undefined unless --primary or --no-primary is given
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \$jobs,
  # Print the full usage together with the version banner
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  # Print the version banner with the short usage
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
);
# Arguments for pod2usage() whenever the invocation is invalid:
# print the usage sections with the version banner and exit with 1
my %ERROR_HASH = (
  '-sections' => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  '-verbose'  => 99,
  '-msg'      => $VERSION_MSG,
  '-exit'     => 1
);

# Input has to be defined
if (!$input) {
  pod2usage(%ERROR_HASH);
};

# Initialize log4perl object: everything at or above the
# requested level is written to STDERR
my %log_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
# Get file name based on path information.
#
# Takes a document path, removes the leading input path (the current
# value of $input, optionally preceded by a slash), replaces every
# remaining slash with a dash and strips leading dashes.
# Returns the resulting flat name (without any file extension).
sub get_file_name ($) {
  my $file = shift;

  # \Q...\E makes the input path match literally - without it,
  # characters like '.' or '+' in the path act as regex operators.
  # The prefix is anchored at the start of the string.
  $file =~ s{^/?\Q$input\E}{};

  # Flatten the remaining path
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};
  return $file;
};
# Write file.
#
# Converts a single document directory by invoking this script again
# in single-file mode, forwarding the relevant command line options.
# The output is written to "<output>/<flat name>.json" (with ".gz"
# appended when gzip is requested).
#
# Takes the path of the document directory.
# Returns the flat file name (without extension) of the document.
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # Same target name as used by the skip checks in archive processing
  my $target = catfile($output, $file . '.json' . ($gzip ? '.gz' : ''));

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
  #
  # The call is built as a list, so system() bypasses the shell:
  # paths containing spaces stay intact and values starting with '#'
  # (like '#all') are not taken as shell comments.
  # $^X is the perl binary currently running this script.
  my @call = (
    $^X, catfile($LOCAL, 'korapxml2krill'),
    '-i', $anno,
    '-o', $target
  );
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # Forward the primary flag only if it was given explicitly,
  # and forward it with the polarity the user chose
  if (defined $primary) {
    push @call, ($primary ? '--primary' : '--no-primary');
  };

  # Plain flag - takes no value
  push @call, '-y' if $pretty;

  push @call, '-a', $_ foreach @allow;
  push @call, '-s', $_ foreach @skip;

  # Report failing conversions instead of silently ignoring them
  if (system(@call) != 0) {
    $log->error(
      'Conversion of ' . $anno . ' failed' .
        ($? == -1 ? ': ' . $! : ' with status ' . ($? >> 8))
    );
  };

  return "$file";
};
# Process a single file:
# Parse the document in $input, add all annotation layers that are
# not skipped and write the result to $output (or to STDOUT).
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lookup table of lowercased skip entries
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time passed since the last stop
  # and since the start of the program
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document (the path needs a trailing slash)
  $input =~ s{([^/])$}{$1/};

  # Name used in log messages - $output is not set when printing to STDOUT
  my $target = defined $output ? $output : $input;

  my $doc = KorAP::XML::Krill->new( path => $input );

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization - the layer part of --token is optional,
  # so only override the defaults with the parts that are given
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry if defined $foundry && length $foundry;
    $token_base_layer   = $layer   if defined $layer   && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported annotation layers as [foundry, layer],
  # in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - add the explicitly allowed layers only
  if ($skip{'#all'}) {
    foreach (@allow) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my $foundry = lc($info->[0]);
      my $layer   = lc($info->[1]);

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  # Serialize: human readable, pretty printed JSON or compact JSON
  my $print_text = $text ? $tokens->to_string($primary) :
    ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    # Load the class explicitly instead of relying on another module
    require IO::File;

    my $file = $gzip
      ? IO::Compress::Gzip->new($output, Minimal => 1)
      : IO::File->new($output, "w");

    # Both constructors return undef on failure
    unless ($file) {
      $log->error(
        'Unable to open ' . $output . ' for writing: ' .
          ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }
  else {
    print $print_text . "\n";
  };

  stop_time();
}
# Process an archive:
# Convert every document found in the input directory or Zip archive
# into its own file in the output directory, optionally in parallel forks.
elsif ($cmd eq 'archive') {

  # The output directory is mandatory ...
  pod2usage(%ERROR_HASH) unless $output;

  # ... and has to exist
  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first;
      # the data reference handed to finish() comes last
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # No data arrives if the child ended without calling finish()
      my $msg = ref $data eq 'SCALAR' && defined $$data ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  # Start time of the processing
  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $doc_dir (@dirs) {

      # Skip documents that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($doc_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next document
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($doc_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    # Load the class explicitly instead of relying on another module
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      # (local names differ from the file-level $text option on purpose)
      my ($prefix, $corpus, $doc_id, $text_id) = $archive->split_path($text_path);

      # Skip documents that were already converted
      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc_id, $text_id)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next document
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory - $input is reset, as it is
        # the prefix get_file_name() strips from the document path
        $input = catdir("$temp", $corpus);

        # Temporary directory of the extracted document
        my $dir = catdir($input, $doc_id, $text_id);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before finishing
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {
        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing to process - stop here, as there is neither a
  # child to wait for nor a start time to report on
  else {
    print "Input is neither a directory nor an archive.\n\n";
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
__END__
=pod
=encoding utf8
=head1 NAME
korapxml2krill - Merge KorapXML data and create Krill friendly documents
=head1 SYNOPSIS
$ korapxml2krill [archive] -z --input <directory> --output <filename>
=head1 DESCRIPTION
L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
=head1 INSTALLATION
The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-Krill
In case everything went well, the C<korapxml2krill> command line tool will
be available.
=head1 ARGUMENTS
=over 2
=item B<archive>
Process an archive as a Zip-File or a folder of KorAP-XML documents.
=back
=head1 OPTIONS
=over 2
=item B<--input|-i> <directory|file>
Directory or archive file of documents to index.
=item B<--output|-o> <directory|file>
Output folder for archive processing or
document name for single output (optional),
writes to <STDOUT> by default.
=item B<--overwrite|-w>
Overwrite files that already exist.
=item B<--token|-t> <foundry>[#<file>]
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to OpenNLP#tokens.
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific foundries by specifying their name,
or specific layers by appending a # and the name
of the layer to the name of the foundry,
e.g. Mate#Morpho. Alternatively you can skip #ALL.
Can be set multiple times.
=item B<--allow|-a> <foundry>#<layer>
Allow specific foundries and layers by defining them
combining the foundry name with a # and the layer name.
=item B<--primary|-p>
Output primary data or not. Defaults to true.
Can be flagged using --no-primary as well.
=item B<--jobs|-j>
Define the number of concurrent jobs in separate forks
for archive processing, defaults to 0. This is B<EXPERIMENTAL>!
=item B<--human|-m>
Represent the data human friendly, while the output defaults to JSON.
=item B<--pretty|-y>
Pretty print JSON output.
=item B<--gzip|-z>
Compress the output (expects a defined output file in single processing).
=item B<--log|-l>
The L<Log4perl> log level, defaults to C<ERROR>.
=item B<--help|-h>
Print this document.
=item B<--version|-v>
Print version information.
=back
=head1 AVAILABILITY
https://github.com/KorAP/KorAP-XML-Krill
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>
L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
=cut