blob: 5e5b3154c82507382728d8e7ffa7fd06d3dd72a8 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use POSIX qw/ceil/;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Config::Simple;
use Parallel::ForkManager;
use v5.10;
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );
use File::Glob ':bsd_glob';
use File::Temp qw/tempdir/;
use File::Path qw(remove_tree make_path);
use File::Basename;
use Mojo::Collection 'c';
use String::Random qw(random_string);
use IO::File;
use Archive::Tar::Builder;
use Fcntl qw(:flock SEEK_END);
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
# TODO: make output files
# TODO: Use KorAP::XML::ForkPool!
# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
#
# 2016/07/06
# - Added MDParser#Dependency
#
# 2016/10/15
# - Fixed temporary path issue in script
#
# 2016/10/24
# - Improved Windows support
#
# 2016/10/24
# - Added support for document extraction
#
# 2016/10/27
# - Added wildcard support for document extraction
#
# 2016/12/21
# - added support for base-sentences and base-tokenizations
#
# 2017/01/20
# - added support for DRuKoLa annotations
#
# 2017/02/08
# - added support for pagebreak annotations
#
# 2017/04/06
# - added support for wildcards in input
#
# 2017/04/07
# - support configuration option
# - support for temporary extraction
#
# 2017/04/12
# - support serial processing
# - support input root
# - introduced --sequential-extraction flag
#
# 2017/06/19
# - added support for DCK
#
# 2017/06/29
# - Fixed exit codes
#
# 2017/07/04
# - Fixed tar building process
#
# 2018/01/16
# - Added LWC support
#
# 2018/07/19
# - Preliminary support for HNC.
#
# 2019/01/22
# - Support for non-word tokens.
#
# 2019/02/13
# - Support for 'koral:field' array.
# - Support for Koral versioning.
# - Ignore temporary extract parameter on
# directory archiving.
# ----------------------------------------------------------
# Date shown in the version message.
# NOTE(review): the changelog above lists an entry for 2019/02/13,
# which is newer than this date - confirm whether it should be bumped.
our $LAST_CHANGE = '2019/02/07';
# Directory this script is located in
our $LOCAL = $FindBin::Bin;
# Koral serialization version used when --koral is neither
# passed nor configured
our $KORAL_VERSION = 0.03;
# Message printed with the usage, help and version output
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
# Prototypes (forward declarations, as both functions are
# called before their definition further below)
sub get_file_name_from_glob($);
sub get_file_name($);
# Parse command:
# The first argument is treated as the command (e.g. 'archive',
# 'extract' or 'serial'), unless it starts with a dash
my $cmd;
our @ARGV;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
$cmd = shift @ARGV;
};
# Keep a copy of the raw options, so they can be passed on
# to the child calls in serial processing
my @keep_argv = @ARGV;
# Options that can be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;
# Parse options from the command line.
# Every option is stored in a lexical declared in place, so options
# that were not passed stay undefined and can be filled from the
# configuration file or with defaults later on.
GetOptions(
  'input|i=s'               => \@input,
  'input-base|ib=s'         => \(my $input_base),
  'output|o=s'              => \(my $output),
  'overwrite|w'             => \(my $overwrite),
  'meta|m=s'                => \(my $meta),
  'token|t=s'               => \(my $token_base),
  'base-sentences|bs=s'     => \(my $base_sentences),
  'base-paragraphs|bp=s'    => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s'   => \(my $base_pagebreaks),
  'gzip|z'                  => \(my $gzip),
  'temporary-extract|te=s'  => \(my $extract_dir),
  'skip|s=s'                => \@skip,
  'sigle|sg=s'              => \@sigle,
  'cache|c=s'               => \(my $cache_file),
  'config|cfg=s'            => \(my $cfg_file),
  'log|l=s'                 => \(my $log_level),
  'anno|a=s'                => \@anno,
  'primary|p!'              => \(my $primary),
  'pretty|y'                => \(my $pretty),
  'jobs|j=i'                => \(my $jobs),
  'koral|k=f'               => \(my $koral),
  'to-tar'                  => \(my $to_tar),
  'non-word-tokens|nwt'     => \(my $non_word_tokens),

  # Negatable, so that --no-sequential-extraction (as documented
  # in the manual) can override a value set in the configuration
  'sequential-extraction|se!' => \(my $sequential_extraction),

  'cache-size|cs=s'         => \(my $cache_size),
  'cache-delete|cd!'        => \(my $cache_delete),
  'cache-init|ci!'          => \(my $cache_init),

  # Print the manual sections and exit
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message and exit
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
# Load from configuration.
# Values passed on the command line always win: the configuration
# file only fills options that are still unset at this point.
if ($cfg_file && -e $cfg_file) {
  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: configuration key => variable to fill
  my @scalar_options = (
    'overwrite'             => \$overwrite,
    'gzip'                  => \$gzip,
    'jobs'                  => \$jobs,
    'koral'                 => \$koral,
    'input-base'            => \$input_base,
    'temporary-extract'     => \$extract_dir,
    'token'                 => \$token_base,
    'non-word-tokens'       => \$non_word_tokens,
    'cache'                 => \$cache_file,
    'cache-size'            => \$cache_size,
    'cache-delete'          => \$cache_delete,
    'cache-init'            => \$cache_init,
    'sequential-extraction' => \$sequential_extraction,
    'meta'                  => \$meta,
    'output'                => \$output,
    'base-sentences'        => \$base_sentences,
    'base-paragraphs'       => \$base_paragraphs,
    'base-pagebreaks'       => \$base_pagebreaks,
    'to-tar'                => \$to_tar,
    'log'                   => \$log_level
  );

  while (my ($key, $target) = splice(@scalar_options, 0, 2)) {
    next if defined ${$target};
    next unless defined $config{$key};
    ${$target} = $config{$key};
  };

  # Multi value options are given as semicolon separated lists
  my @list_options = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno
  );

  while (my ($key, $target) = splice(@list_options, 0, 2)) {
    next if scalar @{$target};
    next unless defined $config{$key};
    @{$target} = split /\s*;\s*/, $config{$key};
  };
};
# Defaults for everything that was neither passed nor configured
$token_base            //= 'OpenNLP#tokens';
$cache_file            //= 'korapxml2krill.cache';
$cache_size            //= '50m';
$cache_delete          //= 1;
$cache_init            //= 1;
$jobs                  //= 0;
$koral                 //= $KORAL_VERSION;
$sequential_extraction //= 0;
$non_word_tokens       //= 0;
$log_level             //= 'ERROR';

# Base layers default to unset and are compared in lower case
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');

# Initialize log4perl object - everything is logged to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger'
    => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};

# Parameters for pod2usage on invalid calls (exits with status 1)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};

# Choose the number of jobs based on the number of cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
# Start serial processing:
# Every input is converted by a separate 'archive' call of this
# script. The output of each call is written to a path below the
# output directory that is named after the input.
# Exits with status 1, if at least one of the calls failed.
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs (flags and their values) from the
  # options passed through, as they are set separately for each call
  my $remove_next = 0;
  @keep_argv = @{c(@keep_argv)->grep(
    sub {
      # Input or output flag
      if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
        $remove_next = 1;
        return 0;
      }

      # Value of the flag
      elsif ($remove_next) {
        $remove_next = 0;
        return 0;
      };

      # Pass parameter
      return 1;
    }
  )->to_array};

  # Number of archive calls that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - system() returns 0 on success only
    if (system(@archive_cmd) != 0) {
      $failed++;
      $log->error(
        "Serial processing of $_ failed" .
          ($? == -1 ? ": $!" : ' with exit status ' . ($? >> 8))
      );
    };
  };

  # Report failures of the child processes to the caller
  exit($failed ? 1 : 0);
};
# Lookup table of all foundries and layers to skip (lower cased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# Structures that are taken from DeReKo#Structure
# instead of the base foundry
my @dereko_attr =
  map  { $_->[0] }
  grep { $_->[1] eq 'dereko#structure' } (
    ['sentences',  $base_sentences],
    ['paragraphs', $base_paragraphs],
    ['pagebreaks', $base_pagebreaks]
  );

# All supported annotations as [Foundry, Layer] pairs
# (with an optional third parameter element)
my @layers = (

  # Base - only used if no alternative base layer is given
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - pass the requested base structures, if there are any
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
# Check filters
my @filtered_anno;

# Everything is skipped - only convert the annotations
# that were requested explicitly using --anno
if ($skip{'#all'}) {
  foreach (@anno) {
    push @filtered_anno, [ split('#', $_) ];
  };
}

# Add all annotations that are not skipped
else {
  # Add to index file - respect skipping
  foreach my $info (@layers) {
    # Skip if Foundry or Foundry#Layer should be skipped
    unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
      push @filtered_anno, $info;
    };
  };
};

# Get tokenization basis.
# The declaration is unconditional, because the behaviour of
# "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base // '');

# Remove file extension - the layer is optional,
# so it may be undefined at this point
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that converts a single text
# based on all the options given
my $batch_file = KorAP::XML::Batch::File->new(
  cache           => $cache,
  meta_type       => $meta,
  overwrite       => $overwrite,
  foundry         => $token_base_foundry,
  layer           => $token_base_layer,
  gzip            => $gzip,
  log             => $log,
  koral           => $koral,
  primary         => $primary,
  pretty          => $pretty,
  anno            => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
# Get file name based on path information.
# Expects the path of a text and returns a flat file name, in which
# the path separators are replaced by dashes. Temporary directory
# fragments and the path of the first input are removed and at most
# the last three dash separated elements are kept.
sub get_file_name ($) {
  my $file = shift;

  # The first input serves as the base path to strip.
  # In case it is a directory without a trailing slash,
  # the last path element is not part of the base.
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path. The path is quoted, so characters like
  # '.', '+', '(' or backslashes are matched literally instead of
  # being interpreted as regular expression syntax.
  # (The former leading "^?" was an optional anchor and
  # therefore had no effect on the match.)
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path and remove leading separators
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Only keep the last three elements
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
# Create a file name from an input path or glob pattern.
# Path separators and brackets are turned into dashes, wildcards
# are removed and a trailing '.zip' extension is stripped.
sub get_file_name_from_glob ($) {
  my $name = shift;

  # Transform path separators (slashes and backslashes)
  $name =~ tr{\\/}{-};

  # Remove arbitrary fills
  $name =~ tr/*?//d;

  # Replace class and multiple brackets
  $name =~ tr/{}[]/-/;

  # Squeeze sequences of binding characters
  $name =~ tr/-//s;

  # Clean beginning and end
  $name =~ s/^-//;
  $name =~ s/-$//;

  # Remove file extension
  $name =~ s/\.zip$//;

  return $name;
};
# Convert sigle to path construct,
# i.e. 'Corpus_Doc.Text' becomes 'Corpus/Doc/Text'
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# All commands write to a directory, unless a tar file is requested
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
# Glob and prefix files:
# Every input may be a BSD glob pattern and is resolved relative
# to the input base, in case one is given.
if (@input) {
  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;
    push (@new_input, bsd_glob($pattern));
  };

  # A wildcard that matches nothing expands to an empty list -
  # stop here instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length, so the shortest path is the primary input
  @input = sort { length($a) <=> length($b) } @new_input;
  print 'Input is ' . join(', ', @input)."\n";
};
# Process a single file (no command was given)
unless ($cmd) {

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info("The code took: $since_last_stop (overall: $overall)");
    $main::LAST_STOP = $now;
  };

  # The path of the document needs to end with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Create, parse and write the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
# Extract XML files from the archives to the output directory
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      $log->error("Unzip is not installed or incompatible.");
      exit 1;
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # Will set @sigle (see set_sigle() for the handling
    # of document sigles and prefixes)
    my $prefix = set_sigle($archive);

    # Iterate over all given sigles and extract
    foreach (@sigle) {
      print "$_ ...\n";

      # TODO: Make this OS independent
      print '... ' . (
        # TODO:
        # - prefix???
        $archive->extract_sigle([$_], $output, $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    };
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };
}

# Process an archive (or a corpus directory)
# and convert all texts it contains
elsif ($cmd eq 'archive') {

  # First extract, then archive
  # (ignored, in case the input is already a directory)
  if (defined $extract_dir && !-d $input[0]) {

    # Create new archive object
    if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

      # Check zip capabilities
      unless ($archive->test_unzip) {
        $log->error("Unzip is not installed or incompatible.");
        exit 1;
      };

      # Add further annotation archived
      $archive->attach($_) foreach @input[1..$#input];

      # Create a temporary directory
      if ($extract_dir eq ':temp:') {
        $extract_dir = tempdir(CLEANUP => 0);
        print "Temporarily extract to $extract_dir\n";
      };

      # Add some random extra to avoid clashes with multiple archives
      $extract_dir = catdir($extract_dir, random_string('cccccc'));

      # Extract to temporary directory - from now on
      # the extraction directory is the only input
      if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
        @input = ($extract_dir);
      }
      else {
        $log->error('Unable to extract from primary archive ' . $input[0] .
                      ' to ' . $extract_dir);
        exit 1;
      };
    }

    # Can't create archive object
    else {
      $log->error('Unable to extract from primary archive ' . $input[0]);
      exit 1;
    };
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  my $tar_archive;
  my $output_dir = $output;
  my $tar_fh;

  # Initialize tar archive
  if ($to_tar) {
    $tar_archive = Archive::Tar::Builder->new(
      ignore_errors => 1
    );

    # Set output name
    my $tar_file = $output;
    unless ($tar_file =~ /\.tar$/) {
      $tar_file .= '.tar';
    };

    # Initiate the tar file
    print "Writing to file $tar_file\n";
    $tar_fh = IO::File->new($tar_file, 'w');

    # IO::File->new returns undef on failure - stop with a
    # meaningful message instead of dying on the method call below
    unless ($tar_fh) {
      $log->error("Unable to open tar file '$tar_file' for writing: $!");
      exit 1;
    };

    $tar_fh->binmode(1);

    # Set handle
    $tar_archive->set_handle($tar_fh);

    # Output to temporary directory
    $output_dir = File::Temp->newdir;
  };

  # Report on fork message.
  # The data passed by the job is an array reference containing
  # the message, the temporary directory (if any) and the
  # name of the file written (if any).
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Move the written file to the tar archive
      if (!$code && $to_tar && $data->[2]) {
        my $filename = $data->[2];

        # Lock filehandle
        if (flock($tar_fh, LOCK_EX)) {

          my $clean_file = fileparse($filename);

          # Archive and remove file
          $tar_archive->archive_as($filename => $clean_file);
          unlink $filename;

          # Unlock filehandle
          flock($tar_fh, LOCK_UN);
        }
        else {
          $log->warn("Unable to add $filename to archive");
        };
      };

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  # Start time of the processing. It is initialized here, so the
  # final report also works if no processing was started at all.
  my $t = Benchmark->new;
  my $temp;

  print "Reading data ...\n";

  #  unless (Cache::FastMmap->new(
  #    share_file => $cache_file,
  #    cache_size => $cache_size,
  #    init_file => $cache_init
  #  )) {
  #    print "Unable to intialize cache '$cache_file'\n\n";
  #    exit(1);
  #  };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories that contain a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output_dir,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(
          0,
          [
            "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
            undef,
            $filename
          ]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      $log->error("Unzip is not installed or incompatible.");
      exit 1;
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # Get sigles to extract
    my $prefix = set_sigle($archive);

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output_dir,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if (my $return = $batch_file->process($dir => $filename)) {

          # Delete temporary file
          $pool->finish(
            0,
            [
              "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
              $temp,
              $filename
            ]
          );
        }
        else {
          # Delete temporary file
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  # Close tar filehandle
  if ($to_tar && $tar_fh) {
    $tar_archive->finish;
    $tar_fh->close;
    print "Wrote to tar archive.\n";
  };

  print timestr(timediff(Benchmark->new, $t))."\n";
  print "Done.\n";
};
# For an archive, this will create the list
# of all sigles to process.
# Expects the archive object and returns the prefix information
# of the archive. In case no sigles were given, @sigle is filled
# with all texts of the archive. Otherwise given document sigles
# are extracted to the output directly and only the text sigles
# are kept in @sigle.
sub set_sigle {
  my $archive = shift;
  my $prefix = 1;

  # No sigles given - use all texts of the archive
  unless (@sigle) {
    foreach my $text_path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($text_path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check if a prefix is needed - this is only done
  # once, for the first sigle that is looked at
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;
    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  foreach my $sigle (@sigle) {

    # Sigle is a doc sigle
    if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      print "$sigle ...";
      $check_prefix_once->();
      print "\n";

      my $extracted = $archive->extract_sigle(
        [$sigle], $output, $sequential_extraction ? 1 : $jobs
      );
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $sigle;
      $check_prefix_once->();
    };
  };

  @sigle = @text_sigle;
  return $prefix;
};
# Cleanup temporary extraction directory.
# NOTE(review): this also runs when the extraction step was skipped
# (e.g. for directory input) - confirm that removing a directory
# passed by the user is intended in that case.
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};

print "\n";
__END__
=pod
=encoding utf8
=head1 NAME
korapxml2krill - Merge KorapXML data and create Krill documents
=head1 SYNOPSIS
korapxml2krill [archive|extract] --input <directory|archive> [options]
=head1 DESCRIPTION
L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
The C<korapxml2krill> command line tool is a simple wrapper to the library.
=head1 INSTALLATION
The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition to work with zip archives, the C<unzip> tool needs to be present.
=head1 ARGUMENTS
$ korapxml2krill -z --input <directory> --output <filename>
Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
It expects the input to point to the text level folder.
=over 2
=item B<archive>
$ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Converts an archive of KorAP-XML documents. It expects a directory
(pointing to the corpus level folder) or one or more zip files as input.
=item B<extract>
$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Extracts KorAP-XML documents from a zip file.
=item B<serial>
$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
=back
=head1 OPTIONS
=over 2
=item B<--input|-i> <directory|zip file>
Directory or zip file(s) of documents to convert.
Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
document, while C<archive> expects a KorAP-XML corpus folder or a zip
file to batch process multiple files.
C<extract> expects zip files only.
C<archive> supports multiple input zip files with the constraint,
that the first archive listed contains all primary data files
and all meta data files.
-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Input may also be defined using BSD glob wildcards.
-i 'file/news*.zip'
The extended input array will be sorted in length order, so the shortest
path needs to contain all primary data files and all meta data files.
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
This may require to quote the parameter.)
To support zip files, a version of C<unzip> needs to be installed that is
compatible with the archive file.
B<The root folder switch using the hash sign is experimental and
may vanish in future versions.>
=item B<--input-base|-ib> <directory>
The base directory for inputs.
=item B<--output|-o> <directory|file>
Output folder for archive processing or
document name for single output (optional),
writes to C<STDOUT> by default
(in case C<output> is not mandatory due to further options).
=item B<--overwrite|-w>
Overwrite files that already exist.
=item B<--token|-t> <foundry>#<file>
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
=item B<--base-sentences|-bs> <foundry>#<layer>
Define the layer for base sentences.
If given, this will be used instead of using C<Base#Sentences>.
Currently C<DeReKo#Structure> is the only additional layer supported.
Defaults to unset.
=item B<--base-paragraphs|-bp> <foundry>#<layer>
Define the layer for base paragraphs.
If given, this will be used instead of using C<Base#Paragraphs>.
Currently C<DeReKo#Structure> is the only additional layer supported.
Defaults to unset.
=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
Define the layer for base pagebreaks.
Currently C<DeReKo#Structure> is the only layer supported.
Defaults to unset.
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Can be set multiple times.
=item B<--anno|-a> <foundry>#<layer>
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
=item B<--primary|-p>
Output primary data or not. Defaults to C<true>.
Can be flagged using C<--no-primary> as well.
This is I<deprecated>.
=item B<--non-word-tokens|-nwt>
Tokenize non-word tokens like word tokens (defined as matching
C</[\d\w]/>). Useful to treat punctuations as tokens.
Defaults to unset.
=item B<--jobs|-j>
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
Unless C<sequential-extraction> is set, this will
also apply to extraction.
Pass -1, and the value will be set automatically to 5
times the number of available cores.
This is I<experimental>.
=item B<--koral|-k>
Version of the output format. Supported versions are:
C<0> for legacy serialization, C<0.03> for serialization
with metadata fields as key-values on the root object,
C<0.4> for serialization with metadata fields as a list
of C<"@type":"koral:field"> objects.
Currently defaults to C<0.03>.
=item B<--sequential-extraction|-se>
Flag to indicate that archives are extracted sequentially,
so the C<jobs> value does not apply to extraction.
Some systems may have problems with extracting multiple archives
to the same folder at the same time.
Can be flagged using C<--no-sequential-extraction> as well.
Defaults to C<false>.
=item B<--meta|-m>
Define the metadata parser to use. Defaults to C<I5>.
Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
This is I<experimental>.
=item B<--pretty|-y>
Pretty print JSON output. Defaults to C<false>.
This is I<deprecated>.
=item B<--gzip|-z>
Compress the output.
Expects a defined C<output> file in single processing.
=item B<--cache|-c>
File to mmap a cache (using L<Cache::FastMmap>).
Defaults to C<korapxml2krill.cache> in the calling directory.
=item B<--cache-size|-cs>
Size of the cache. Defaults to C<50m>.
=item B<--cache-init|-ci>
Initialize cache file.
Can be flagged using C<--no-cache-init> as well.
Defaults to C<true>.
=item B<--cache-delete|-cd>
Delete cache file after processing.
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.
=item B<--config|-cfg>
Configure the parameters of your call in a file
of key-value pairs with whitespace separator
overwrite 1
token DeReKo#Structure
...
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-init>,
C<cache-delete>, C<meta>,
C<output>, C<koral>, C<to-tar>, C<non-word-tokens>,
C<temporary-extract>, C<sequential-extraction>,
C<base-sentences>, C<base-paragraphs>,
C<base-pagebreaks>,
C<skip> (semicolon separated), C<sigle>
(semicolon separated), C<anno> (semicolon separated).
Configuration parameters will always be overwritten by
passed parameters.
=item B<--temporary-extract|-te>
Only valid for the C<archive> command.
This will first extract all files into a
directory and then will archive.
If the directory is given as C<:temp:>,
a temporary directory is used.
This is especially useful to avoid
massive unzipping and potential
network latency.
=item B<--sigle|-sg>
Extract the given texts.
Can be set multiple times.
I<Currently only supported on C<extract>.>
Sigles have the structure C<Corpus>/C<Document>/C<Text>.
In case the C<Text> path is omitted, the whole document will be extracted.
On the document level, the postfix wildcard C<*> is supported.
=item B<--log|-l>
The L<Log4perl> log level, defaults to C<ERROR>.
=item B<--help|-h>
Print this document.
=item B<--version|-v>
Print version information.
=back
=head1 ANNOTATION SUPPORT
L<KorAP::XML::Krill> has built-in importer for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element are mandatory for
L<Krill|https://github.com/KorAP/Krill>.
Base
#Paragraphs
#Sentences
Connexor
#Morpho
#Phrase
#Sentences
#Syntax
CoreNLP
#Constituency
#Morpho
#NamedEntities
#Sentences
CMC
#Morpho
DeReKo
#Structure
DRuKoLa
#Morpho
Glemm
#Morpho
HNC
#Morpho
LWC
#Dependency
Malt
#Dependency
MarMoT
#Morpho
Mate
#Dependency
#Morpho
MDParser
#Dependency
OpenNLP
#Morpho
#Sentences
Sgbr
#Lemma
#Morpho
TreeTagger
#Morpho
#Sentences
XIP
#Constituency
#Morpho
#Sentences
More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
See the built-in annotation importers as examples.
=head1 AVAILABILITY
https://github.com/KorAP/KorAP-XML-Krill
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Author: L<Nils Diewald|http://nils-diewald.de/>
Contributor: Eliza Margaretha
L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
=cut