blob: 42292ad6c8c670690abfcbe5629dbaf75e36663b [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Parallel::ForkManager;
# TODO: use Parallel::Loops
# TODO: make output files
# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
#
# 2016/07/06
# - Added MDParser#Dependency
#
# 2016/10/15
# - Fixed temporary path issue in script
#
# 2016/10/24
# - Improved Windows support
#
# ----------------------------------------------------------
# Date of the latest entry in the change log above
our $LAST_CHANGE = '2016/10/24';

# Directory of this script, as resolved by FindBin
our $LOCAL = $FindBin::Bin;

# Banner printed in front of the usage and version information:
# library version, contact address and date of the last change
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
# Parse command
# The first argument is taken as the command, unless it is missing,
# false, or starts with a dash (in which case it is an option)
my $cmd;
our @ARGV;
$cmd = shift @ARGV
  if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';
# Collectors for options that can be set multiple times
my (@skip, @sigle, @anno, @input);

# Arguments for pod2usage() whenever the command line is invalid:
# print the version banner and the usage sections to STDOUT
# and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Parse options from the command line
# Scalar options are declared inline (with their defaults) and stay
# visible for the rest of the script.
# GetOptions() returns false on unknown options or invalid values
# (it already warns about the details), so stop with the usage
# instead of silently continuing with defaults.
GetOptions(
  'input|i=s'        => \@input,
  'output|o=s'       => \(my $output),
  'overwrite|w'      => \(my $overwrite),
  'meta|m=s'         => \(my $meta),
  'token|t=s'        => \(my $token_base = 'OpenNLP#tokens'),
  'gzip|z'           => \(my $gzip),
  'skip|s=s'         => \@skip,
  'sigle|sg=s'       => \@sigle,
  'cache|c=s'        => \(my $cache_file = 'korapxml2krill.cache'),
  'log|l=s'          => \(my $log_level = 'ERROR'),
  'anno|a=s'         => \@anno,
  'primary|p!'       => \(my $primary),
  'pretty|y'         => \(my $pretty),
  'jobs|j=i'         => \(my $jobs = 0),
  'cache-size|cs=s'  => \(my $cache_size = '50m'),
  'cache-delete|cd!' => \(my $cache_delete = 1),
  'cache-init|ci!'   => \(my $cache_init = 1),

  # Explicitly requested help is not an error: exit with status 0
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-',
      -exitval  => 0
    );
  },

  # Same for the version information
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-',
      -exitval => 0
    );
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
# Initialize log4perl object
# The root logger uses the requested level (upper-cased) and a single
# ScreenColoredLevels appender named STDERR with a pattern layout
my %log_conf = (
  'log4perl.rootLogger'      => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger used throughout the script
my $log = Log::Log4perl->get_logger('main');
# Lookup table of everything that should be skipped (lower-cased)
my %skip = map { (lc($_) => 1) } @skip;

# All known annotation layers as [Foundry, Layer] pairs
my @layers = (
  ['Base', 'Sentences'],
  ['Base', 'Paragraphs'],

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  ['DeReKo', 'Structure'],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations requested explicitly,
# each split at '#' into its parts
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
# Get tokenization basis
# The variables are declared unconditionally and only assigned
# conditionally: the behaviour of "my ... if ..." is undefined (see perlsyn)
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache backed by the file given with --cache
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object
# It receives all conversion options and is used by the single file
# and the archive processing below
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
# Get file name based on path information
#
# Expects the path of a text and returns a flat name for it:
# A leading temporary directory ("/tmp/<name>") and the base of the
# first input path are removed from the start of the path, all
# remaining slashes are turned into dashes and leading dashes are dropped.
sub get_file_name ($) {
  my $file = shift;

  # Base path to strip: the first input - in case this is a directory,
  # its last segment is kept as part of the name
  my $base = $input[0];
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input base. It is matched at the start of the path only
  # and taken literally (\Q...\E), so characters like '.', '+' or '('
  # in the input path can neither match unrelated text nor break the
  # pattern.
  $file =~ s/^\/?\Q$base\E//;

  # Flatten the remaining path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  return $file;
};
# Convert sigle to path construct
# (e.g. "Corpus_Doc.Text" becomes "Corpus/Doc/Text")
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Commands write to an output directory - in case one is given,
# it has to exist
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is a failure, so do not report success to the caller
  exit(1);
};
# Process a single file
unless ($cmd) {

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info("The code took: $lap (overall: $overall)");
    $main::LAST_STOP = $now;
  };

  # The path of the document is the first input,
  # with a slash appended unless it already ends with one
  (my $path = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($path, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
}
# Extract XML files
elsif ($cmd eq 'extract') {

  # Create new archive object for the primary archive
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Report the failure to the caller
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives
  # NOTE(review): this attaches the primary archive ($input[0]) as well,
  # although it was already passed to new() - confirm this is intended
  $archive->attach($_) foreach @input;

  my $prefix = 1;

  # No sigles given - extract all texts listed in the archive
  unless (@sigle) {

    # Get files
    foreach ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  };

  # Iterate over all given sigles and extract
  my $failed = 0;
  foreach (@sigle) {
    print "$_ ";

    # TODO: Make this OS independent
    my $extracted = $archive->extract(
      ($prefix ? './' : '') . $_, $output
    );
    $failed++ unless $extracted;

    print '' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  print "\n";

  # Succeed, unless at least one text could not be extracted
  exit($failed ? 1 : 0);
}
# Process an archive
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # File::Temp is used for extraction below - load it explicitly
  # instead of relying on another module to have loaded it
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last argument:
      # [message, (temporary directory)]
      my $data = pop @_;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Drop the reference to the temporary directory, if one was passed
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;    # Start time of the processing
  my $temp; # Temporary directory of the current text

  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #           share_file => $cache_file,
  #           cache_size => $cache_size,
  #           init_file => $cache_init
  #         )) {
  #   print "Unable to initialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml
    # TODO: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if ($batch_file->process($text_dir => $filename)) {
        $pool->finish(0, ["Processed " . $filename]);
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    # NOTE(review): this attaches the primary archive ($input[0]) as well,
    # although it was already passed to new() - confirm this is intended
    $archive->attach($_) foreach @input;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @texts = $archive->list_texts;
    $count = scalar @texts;

  ARCHIVE_LOOP:
    foreach my $text_path (@texts) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
        )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory of the text
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed along,
        # so the finish handler can release it
        if ($batch_file->process($dir => $filename)) {
          $pool->finish(0, ["Processed " . $filename, $temp]);
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  # Neither a directory nor a readable archive
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Nothing was processed and the timer ($t) was never started,
    # so stop here with an error instead of reporting "Done." and
    # a run time based on an undefined start time
    unlink($cache_file) if $cache_delete;
    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
# Unknown command
else {

  # Name the rejected command on STDERR,
  # then print the usage and exit with an error
  warn sprintf("Unknown command '%s'.\n\n", $cmd);
  pod2usage(%ERROR_HASH);
}
__END__
=pod
=encoding utf8
=head1 NAME
korapxml2krill - Merge KorapXML data and create Krill documents
=head1 SYNOPSIS
$ korapxml2krill -z --input <directory> --output <filename>
$ korapxml2krill archive -z --input <directory> --output <directory>
$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
=head1 DESCRIPTION
L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
The C<korapxml2krill> command line tool is a simple wrapper to the library.
=head1 INSTALLATION
The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
=head1 ARGUMENTS
=over 2
=item B<archive>
Process an archive as a Zip-file or a folder of KorAP-XML documents.
=item B<extract>
Extract KorAP-XML files from a Zip-file.
=back
=head1 OPTIONS
=over 2
=item B<--input|-i> <directory|file|files>
Directory or archive file of documents to convert.
Archiving supports multiple input archives with the constraint,
that the first archive listed contains all primary data files
and all meta data files.
-i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.)
B<The root folder switch is experimental and may vanish in future versions.>
=item B<--output|-o> <directory|file>
Output folder for archive processing or
document name for single output (optional),
writes to C<STDOUT> by default
(in case C<output> is not mandatory due to further options).
=item B<--overwrite|-w>
Overwrite files that already exist.
=item B<--token|-t> <foundry>[#<file>]
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Can be set multiple times.
=item B<--anno|-a> <foundry>#<layer>
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
=item B<--primary|-p>
Output primary data or not. Defaults to C<true>.
Can be flagged using C<--no-primary> as well.
This is I<deprecated>.
=item B<--jobs|-j>
Define the number of concurrent jobs in separated forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
This is I<experimental>.
=item B<--meta|-m>
Define the metadata parser to use. Defaults to C<I5>.
Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
This is I<experimental>.
=item B<--pretty|-y>
Pretty print JSON output. Defaults to C<false>.
This is I<deprecated>.
=item B<--gzip|-z>
Compress the output.
Expects a defined C<output> file in single processing.
=item B<--cache|-c>
File to mmap a cache (using L<Cache::FastMmap>).
Defaults to C<korapxml2krill.cache> in the calling directory.
=item B<--cache-size|-cs>
Size of the cache. Defaults to C<50m>.
=item B<--cache-init|-ci>
Initialize cache file.
Can be flagged using C<--no-cache-init> as well.
Defaults to C<true>.
=item B<--cache-delete|-cd>
Delete cache file after processing.
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.
=item B<--sigle|-sg>
Extract the given text sigles.
Can be set multiple times.
I<Currently only supported on C<extract>.>
Sigles have the structure C<Corpus>/C<Document>/C<Text>.
=item B<--log|-l>
The L<Log4perl> log level, defaults to C<ERROR>.
=item B<--help|-h>
Print this document.
=item B<--version|-v>
Print version information.
=back
=head1 ANNOTATION SUPPORT
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element are mandatory for
L<Krill|https://github.com/KorAP/Krill>.
=over 2
=item B<Base>
=over 4
=item #Paragraphs
=item #Sentences
=back
=item B<Connexor>
=over 4
=item #Morpho
=item #Phrase
=item #Sentences
=item #Syntax
=back
=item B<CoreNLP>
=over 4
=item #Constituency
=item #Morpho
=item #NamedEntities
=item #Sentences
=back
=item B<DeReKo>
=over 4
=item #Structure
=back
=item B<Glemm>
=over 4
=item #Morpho
=back
=item B<Mate>
=over 4
=item #Dependency
=item #Morpho
=back
=item B<OpenNLP>
=over 4
=item #Morpho
=item #Sentences
=back
=item B<Sgbr>
=over 4
=item #Lemma
=item #Morpho
=back
=item B<TreeTagger>
=over 4
=item #Morpho
=item #Sentences
=back
=item B<XIP>
=over 4
=item #Constituency
=item #Morpho
=item #Sentences
=back
=back
More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
See the built-in annotation importers as examples.
=head1 AVAILABILITY
https://github.com/KorAP/KorAP-XML-Krill
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>
L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
=cut