#!/usr/bin/env perl
use strict;
use warnings;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
use Benchmark qw/:hireswallclock/;
use IO::Compress::Gzip qw/$GzipError/;
use POSIX qw/ceil/;
use Log::Log4perl;
use Pod::Usage;
use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
use KorAP::XML::Tokenizer;
use KorAP::XML::Batch::File;
use Config::Simple;
use Parallel::ForkManager;
use v5.10;
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );
use File::Glob ':bsd_glob';
use File::Temp qw/tempdir/;
use File::Path qw(remove_tree make_path);
use File::Basename;
use Mojo::Collection 'c';
use String::Random qw(random_string);
use IO::File;
use Archive::Tar::Builder;
use Fcntl qw(:flock SEEK_END);
# use KorAP::XML::ForkPool;
# TODO: use Parallel::Loops
# TODO: make output files
# TODO: Use KorAP::XML::ForkPool!
# CHANGES:
# ----------------------------------------------------------
# 2013/11/25
# - Initial release
#
# 2014/10/29
# - Merges foundry data to create indexer friendly documents
#
# 2016/02/04
# - renamed to korapxml2krill
# - added Schreibgebrauch support
#
# 2016/02/12
# - fixed foundry skipping
# - Support overwrite in archive processing
#
# 2016/02/14
# - Added version information
# - Added support for archive files
#
# 2016/02/15
# - Fixed temporary directory bug
# - Improved skipping before unzipping
# - Added EXPERIMENTAL concurrency support
#
# 2016/02/23
# - Merge korapxml2krill and korapxml2krill_dir
#
# 2016/02/27
# - Added extract function
#
# 2016/03/17
# - Added meta switch
#
# 2016/03/18
# - Added meta data caching
#
# 2016/06/27
# - Added multi archive support
# - Added prefix negation support
# - Added Malt#Dependency support
#
# 2016/07/06
# - Added MDParser#Dependency
#
# 2016/10/15
# - Fixed temporary path issue in script
#
# 2016/10/24
# - Improved Windows support
#
# 2016/10/24
# - Added support for document extraction
#
# 2016/10/27
# - Added wildcard support for document extraction
#
# 2016/12/21
# - added support for base-sentences and base-tokenizations
#
# 2017/01/20
# - added support for DRuKoLa annotations
#
# 2017/02/08
# - added support for pagebreak annotations
#
# 2017/04/06
# - added support for wildcards in input
#
# 2017/04/07
# - support configuration option
# - support for temporary extraction
#
# 2017/04/12
# - support serial processing
# - support input root
# - introduced --sequential-extraction flag
#
# 2017/06/19
# - added support for DCK
#
# 2017/06/29
# - Fixed exit codes
#
# 2017/07/04
# - Fixed tar building process
#
# 2018/01/16
# - Added LWC support
#
# 2018/07/19
# - Preliminary support for HNC.
#
# 2019/01/22
# - Preliminary support for DGD.
# - Support for non-word tokens.
#
# 2019/02/13
# - Support for 'koral:field' array.
# - Support for Koral versioning.
# - Ignore temporary extract parameter on
# directory archiving.
#
# 2019/08/08
# - Support for Talismane.
#
# 2019/12/17
# - Added support for DGD pseudo-sentences
# based on anchor milestones.
# - Support for non-verbal annotations.
# ----------------------------------------------------------
our $LAST_CHANGE = '2019/12/17';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
# Prototypes
sub get_file_name_from_glob($);
sub get_file_name($);
# Parse command
my $cmd;
our @ARGV;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
$cmd = shift @ARGV;
};
my @keep_argv = @ARGV;
my (@skip, @sigle, @anno, @input);
my $text;
# Parse options from the command line
GetOptions(
'input|i=s' => \@input,
'input-base|ib=s' => \(my $input_base),
'output|o=s' => \(my $output),
'overwrite|w' => \(my $overwrite),
'meta|m=s' => \(my $meta),
'token|t=s' => \(my $token_base),
'base-sentences|bs=s' => \(my $base_sentences),
'base-paragraphs|bp=s' => \(my $base_paragraphs),
'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
'gzip|z' => \(my $gzip),
'temporary-extract|te=s' => \(my $extract_dir),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
'cache|c=s' => \(my $cache_file),
'config|cfg=s' => \(my $cfg_file),
'log|l=s' => \(my $log_level),
'anno|a=s' => \@anno,
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
'jobs|j=i' => \(my $jobs),
'koral|k=f' => \(my $koral),
'to-tar' => \(my $to_tar),
'non-word-tokens|nwt' => \(my $non_word_tokens),
'non-verbal-tokens|nvt' => \(my $non_verbal_tokens),
'sequential-extraction|se' => \(my $sequential_extraction),
'cache-size|cs=s' => \(my $cache_size),
'cache-delete|cd!' => \(my $cache_delete),
'cache-init|ci!' => \(my $cache_init),
'help|h' => sub {
pod2usage(
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
-verbose => 99,
-msg => $VERSION_MSG,
-output => '-'
);
},
'version|v' => sub {
pod2usage(
-verbose => 0,
-msg => $VERSION_MSG,
-output => '-'
)
}
);
# Load from configuration
if ($cfg_file && -e $cfg_file) {
my %config;
Config::Simple->import_from($cfg_file, \%config);
# Overwrite
if (!defined($overwrite) && defined $config{overwrite}) {
$overwrite = $config{overwrite};
};
# Gzip
if (!defined($gzip) && defined $config{gzip}) {
$gzip = $config{gzip};
};
# Jobs
if (!defined($jobs) && defined $config{jobs}) {
$jobs = $config{jobs};
};
# Koral version
if (!defined($koral) && defined $config{koral}) {
$koral = $config{koral};
};
# Input root base directory
if (!defined($input_base) && defined $config{'input-base'}) {
$input_base = $config{'input-base'};
};
# temporary-extract
if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
$extract_dir = $config{'temporary-extract'};
};
# Token base
if (!defined($token_base) && defined $config{token}) {
$token_base = $config{token};
};
# Non-word tokenization
if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
$non_word_tokens = $config{'non-word-tokens'};
};
# Non-verbal tokenization
if (!defined($non_verbal_tokens) && defined $config{'non-verbal-tokens'}) {
$non_verbal_tokens = $config{'non-verbal-tokens'};
};
# Cache file
if (!defined($cache_file) && defined $config{cache}) {
$cache_file = $config{cache};
};
# Cache size
if (!defined($cache_size) && defined $config{'cache-size'}) {
$cache_size = $config{'cache-size'};
};
# Cache delete
if (!defined($cache_delete) && defined $config{'cache-delete'}) {
$cache_delete = $config{'cache-delete'} ;
};
# Cache init
if (!(defined $cache_init) && defined $config{'cache-init'}) {
$cache_init = $config{'cache-init'} ;
};
# Jobs for extraction
if (!(defined $sequential_extraction) && defined $config{'sequential-extraction'}) {
$sequential_extraction = $config{'sequential-extraction'} ;
};
# Meta
if (!(defined $meta) && defined $config{'meta'}) {
$meta = $config{'meta'} ;
};
# Output
if (!(defined $output) && defined $config{'output'}) {
$output = $config{'output'} ;
};
# Base-sentences
if (!(defined $base_sentences) && defined $config{'base-sentences'}) {
$base_sentences = $config{'base-sentences'} ;
};
# Base-paragraphs
if (!(defined $base_paragraphs) && defined $config{'base-paragraphs'}) {
$base_paragraphs = $config{'base-paragraphs'} ;
};
# Base-pagebreaks
if (!(defined $base_pagebreaks) && defined $config{'base-pagebreaks'}) {
$base_pagebreaks = $config{'base-pagebreaks'} ;
};
# Write to tar
if (!(defined $to_tar) && defined $config{'to-tar'}) {
$to_tar = $config{'to-tar'} ;
};
# Log
if (!(defined $log_level) && defined $config{'log'}) {
$log_level = $config{'log'} ;
};
# Skip
if (!scalar(@skip) && defined $config{'skip'}) {
@skip = split /\s*;\s*/, $config{'skip'} ;
};
# Sigle
if (!scalar(@sigle) && defined $config{'sigle'}) {
@sigle = split /\s*;\s*/, $config{'sigle'} ;
};
# Anno
if (!scalar(@anno) && defined $config{'anno'}) {
@anno = split /\s*;\s*/, $config{'anno'} ;
};
};
# Set default token base
$token_base //= 'OpenNLP#tokens';
$cache_file //= 'korapxml2krill.cache';
$cache_size //= '50m';
$jobs //= 0;
$koral //= $KORAL_VERSION;
$cache_delete //= 1;
$cache_init //= 1;
$sequential_extraction //= 0;
$log_level //= 'ERROR';
$base_sentences //= '';
$base_paragraphs //= '';
$base_pagebreaks //= '';
$non_word_tokens //= 0;
$non_verbal_tokens //= 0;
$base_sentences = lc $base_sentences;
$base_paragraphs = lc $base_paragraphs;
$base_pagebreaks = lc $base_pagebreaks;
# Initialize log4perl object
Log::Log4perl->init({
'log4perl.rootLogger' => uc($log_level) . ', STDERR',
'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
'log4perl.appender.STDERR.layout' => 'PatternLayout',
'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});
my $log = Log::Log4perl->get_logger('main');
print "Reading config from $cfg_file\n" if $cfg_file;
my %ERROR_HASH = (
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
-verbose => 99,
-msg => $VERSION_MSG,
-output => '-',
-exit => 1
);
# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;
# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
if ($jobs eq '-1') {
state $cores = Sys::Info->new->device('CPU')->count;
$jobs = ceil(5 * $cores);
$log->info("Run using $jobs jobs on $cores cores");
};
# Start serial processing
if ($cmd && $cmd eq 'serial') {
if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
$log->error("Directory '$output' does not exist.");
exit 1;
};
# Remove all inputs
my $remove_next = 0;
@keep_argv = @{c(@keep_argv)->grep(
sub {
# Input flag
if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
$remove_next = 1;
return 0;
}
# input value
elsif ($remove_next) {
$remove_next = 0;
return 0;
};
# Pass parameter
return 1;
}
)->to_array};
# Iterate over all inputs
foreach (@input) {
# This will create a directory
my $new_out = catdir($output, get_file_name_from_glob($_));
# Create new path, in case the output is not meant to be tarred
unless ($to_tar) {
if (make_path($new_out) == 0 && !-d $new_out) {
$log->error("Can\'t create path $new_out");
exit 1;
};
};
# Create archive command
my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
print "Start serial processing of $_ to $new_out\n";
# Start archiving
system @archive_cmd;
};
exit;
};
my %skip;
$skip{lc($_)} = 1 foreach @skip;
my @layers;
push(@layers, ['Base', 'Sentences']) unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;
# Connexor
push(@layers, ['Connexor', 'Morpho']);
push(@layers, ['Connexor', 'Syntax']);
push(@layers, ['Connexor', 'Phrase']);
push(@layers, ['Connexor', 'Sentences']);
# CoreNLP
push(@layers, ['CoreNLP', 'NamedEntities']);
push(@layers, ['CoreNLP', 'Sentences']);
push(@layers, ['CoreNLP', 'Morpho']);
push(@layers, ['CoreNLP', 'Constituency']);
# CMC
push(@layers, ['CMC', 'Morpho']);
# DeReKo
my @dereko_attr = ();
if ($base_sentences eq 'dereko#structure') {
push @dereko_attr, 'sentences';
};
if ($base_paragraphs eq 'dereko#structure') {
push @dereko_attr, 'paragraphs';
};
if ($base_pagebreaks eq 'dereko#structure') {
push @dereko_attr, 'pagebreaks';
};
if ($dereko_attr[0]) {
push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
}
else {
push(@layers, ['DeReKo', 'Structure']);
};
# DGD
push(@layers, ['DGD', 'Morpho']);
if ($base_sentences eq 'dgd#structure') {
push(@layers, ['DGD', 'Structure', 'base-sentence']);
}
# DRuKoLa
push(@layers, ['DRuKoLa', 'Morpho']);
# Glemm
push(@layers, ['Glemm', 'Morpho']);
# HNC
push(@layers, ['HNC', 'Morpho']);
# LWC
push(@layers, ['LWC', 'Dependency']);
# Malt
push(@layers, ['Malt', 'Dependency']);
# Marmot
push(@layers, ['MarMoT', 'Morpho']);
# Mate
push(@layers, ['Mate', 'Morpho']);
push(@layers, ['Mate', 'Dependency']);
# MDParser
push(@layers, ['MDParser', 'Dependency']);
# OpenNLP
push(@layers, ['OpenNLP', 'Morpho']);
push(@layers, ['OpenNLP', 'Sentences']);
# Schreibgebrauch
push(@layers, ['Sgbr', 'Lemma']);
push(@layers, ['Sgbr', 'Morpho']);
# Talismane
push(@layers, ['Talismane', 'Dependency']);
push(@layers, ['Talismane', 'Morpho']);
# TreeTagger
push(@layers, ['TreeTagger', 'Morpho']);
push(@layers, ['TreeTagger', 'Sentences']);
# XIP
push(@layers, ['XIP', 'Morpho']);
push(@layers, ['XIP', 'Constituency']);
push(@layers, ['XIP', 'Sentences']);
push(@layers, ['XIP', 'Dependency']);
# Check filters
my @filtered_anno;
if ($skip{'#all'}) {
foreach (@anno) {
push @filtered_anno, [ split('#', $_) ];
};
}
# Add all annotations that are not skipped
else {
# Add to index file - respect skipping
foreach my $info (@layers) {
# Skip if Foundry or Foundry#Layer should be skipped
unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
push @filtered_anno, $info;
};
};
};
# Get tokenization basis
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
# Remove file extension
$token_base_layer =~ s/\.xml$//i;
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
share_file => $cache_file,
cache_size => $cache_size,
init_file => $cache_init
);
# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(
cache => $cache,
meta_type => $meta,
overwrite => $overwrite,
foundry => $token_base_foundry,
layer => $token_base_layer,
gzip => $gzip,
log => $log,
koral => $koral,
primary => $primary,
pretty => $pretty,
anno => \@filtered_anno,
non_word_tokens => $non_word_tokens,
non_verbal_tokens => $non_verbal_tokens
);
# Get file name based on path information
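# (e.g. '/tmp/xyz/CORPUS/DOC/TEXT' becomes 'CORPUS-DOC-TEXT')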
sub get_file_name ($) {
my $i = $input[0];
if (-d $i) {
$i =~ s![^\/]+$!!;
};
my $file = shift;
# Remove temp dir fragments
$file =~ s!^/?tmp/[^/]+!!;
$file =~ s!^/?\Q$i\E!!;
$file =~ tr/\//-/;
$file =~ s{^-+}{};
$file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
return $file;
};
sub get_file_name_from_glob ($) {
my $glob = shift;
$glob =~ s![\\\/]!-!g; # Transform paths
$glob =~ s/[\*\?]//g; # Remove arbitrary fills
$glob =~ s/[\{\}\[\]]/-/g; # Remove class and multiple brackets
$glob =~ s/\-\-+/-/g; # Remove sequences of binding characters
$glob =~ s/^-//; # Clean beginning
$glob =~ s/-$//; # Clean end
$glob =~ s/\.zip$//; # Remove file extension
return $glob;
};
# Convert sigle to path construct
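# e.g. 'CORPUS_DOC.TEXT' becomes 'CORPUS/DOC/TEXT'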
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
if ($cmd) {
if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
$log->error("Directory '$output' does not exist.");
exit 1;
};
};
# Glob and prefix files
if (@input) {
my @new_input = ();
# Iterate over all inputs
foreach my $wild_card (@input) {
# Prefix with input root
$wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;
push (@new_input, bsd_glob($wild_card));
};
# Sort files by length
@input = sort { length($a) <=> length($b) } @new_input;
print 'Input is ' . join(', ', @input)."\n";
};
# Process a single file
unless ($cmd) {
my $input = $input[0];
BEGIN {
$main::TIME = Benchmark->new;
$main::LAST_STOP = Benchmark->new;
};
sub stop_time {
my $new = Benchmark->new;
$log->info(
'The code took: '.
timestr(timediff($new, $main::LAST_STOP)) .
' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
);
$main::LAST_STOP = $new;
};
# Create and parse new document
$input =~ s{([^/])$}{$1/};
# Process file
$batch_file->process($input, $output);
# Delete cache file
unlink($cache_file) if $cache_delete;
stop_time;
exit;
};
# Extract XML files
if ($cmd eq 'extract') {
# Output is required
pod2usage(%ERROR_HASH) unless $output;
# Create new archive object
if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
# Check zip capabilities
unless ($archive->test_unzip) {
$log->error("Unzip is not installed or incompatible.");
exit 1;
};
# Add further annotation archives
$archive->attach($_) foreach @input[1..$#input];
# Will set @sigle
my $prefix = set_sigle($archive);
# Iterate over all given sigles and extract
foreach (@sigle) {
print "$_ ...\n";
# TODO: Make this OS independent
print '... ' . (
# TODO:
# - prefix???
$archive->extract_sigle([$_], $output, $jobs)
? '' : 'not '
);
print "extracted.\n";
};
}
# Can't create archive object
else {
$log->error('Unable to extract from primary archive ' . $input[0]);
exit 1;
};
}
# Process an archive
elsif ($cmd eq 'archive') {
my $archive_output;
# First extract, then archive
if (defined $extract_dir && !-d $input[0]) {
# Create new archive object
if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
# Check zip capabilities
unless ($archive->test_unzip) {
$log->error("Unzip is not installed or incompatible.");
exit 1;
};
# Add further annotation archives
$archive->attach($_) foreach @input[1..$#input];
# Create a temporary directory
if ($extract_dir eq ':temp:') {
$extract_dir = tempdir(CLEANUP => 0);
print "Temporarily extract to $extract_dir\n";
};
# Add some random extra to avoid clashes with multiple archives
$extract_dir = catdir($extract_dir, random_string('cccccc'));
# Extract to temporary directory
if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
@input = ($extract_dir);
}
else {
$log->error('Unable to extract from primary archive ' . $input[0] .
' to ' . $extract_dir);
exit 1;
};
}
# Can't create archive object
else {
$log->error('Unable to extract from primary archive ' . $input[0]);
exit 1;
};
};
# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);
my $count = 0; # Texts to process
my $iter = 1; # Current text in process
my $tar_archive;
my $output_dir = $output;
my $tar_fh;
# Initialize tar archive
if ($to_tar) {
$tar_archive = Archive::Tar::Builder->new(
ignore_errors => 1
);
# Set output name
my $tar_file = $output;
unless ($tar_file =~ /\.tar$/) {
$tar_file .= '.tar';
};
# Initiate the tar file
print "Writing to file $tar_file\n";
$tar_fh = IO::File->new($tar_file, 'w');
$tar_fh->binmode(1);
# Set handle
$tar_archive->set_handle($tar_fh);
# Output to temporary directory
$output_dir = File::Temp->newdir;
};
# Report on fork message
$pool->run_on_finish (
sub {
my ($pid, $code) = @_;
my $data = pop;
print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
($iter++) . "/$count]" .
($code ? " $code" : '') .
' ' . $data->[0] . "\n";
if (!$code && $to_tar && $data->[2]) {
my $filename = $data->[2];
# Lock filehandle
if (flock($tar_fh, LOCK_EX)) {
my $clean_file = fileparse($filename);
# Archive and remove file
$tar_archive->archive_as($filename => $clean_file);
unlink $filename;
# Unlock filehandle
flock($tar_fh, LOCK_UN);
}
else {
$log->warn("Unable to add $filename to archive");
};
};
$data->[1] = undef if $data->[1];
}
);
my $t;
my $temp;
print "Reading data ...\n";
# unless (Cache::FastMmap->new(
# share_file => $cache_file,
# cache_size => $cache_size,
# init_file => $cache_init
# )) {
# print "Unable to intialize cache '$cache_file'\n\n";
# exit(1);
# };
# Input is a directory
if (-d $input[0]) {
my $it = Directory::Iterator->new($input[0]);
my @dirs;
my $dir;
# Todo: Make a DO WHILE
while (1) {
if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
push @dirs, $dir;
$it->prune;
};
last unless $it->next;
};
print "Start processing ...\n";
$t = Benchmark->new;
$count = scalar @dirs;
DIRECTORY_LOOP:
for (my $i = 0; $i < $count; $i++) {
my $filename = catfile(
$output_dir,
get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
);
# Get the next fork
$pool->start and next DIRECTORY_LOOP;
if (my $return = $batch_file->process($dirs[$i] => $filename)) {
$pool->finish(
0,
[
"Processed " . $filename . ($return == -1 ? " - already existing" : ''),
undef,
$filename
]
);
}
else {
$pool->finish(1, ["Unable to process " . $dirs[$i]]);
};
};
}
# Input is a file
elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
unless ($archive->test_unzip) {
$log->error("Unzip is not installed or incompatible.");
exit 1;
};
# Add further annotation archives
$archive->attach($_) foreach @input[1..$#input];
# Get sigles to extract
my $prefix = set_sigle($archive);
print "Start processing ...\n";
$t = Benchmark->new;
my @dirs = $archive->list_texts;
$count = scalar @dirs;
ARCHIVE_LOOP:
for (my $i = 0; $i < $count; $i++) {
# Split path information
my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
my $filename = catfile(
$output_dir,
get_file_name(
catfile($corpus, $doc, $text)
. '.json' . ($gzip ? '.gz' : '')
)
);
# Get the next fork
$pool->start and next ARCHIVE_LOOP;
# Create temporary file
$temp = File::Temp->newdir;
# TODO: Check if $filename exist at the beginning,
# because extraction can be horrible slow!
# Extract from archive
if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
# Create corpus directory
my $input = catdir("$temp", $corpus);
# Temporary directory
my $dir = catdir($input, $doc, $text);
# Write file
if (my $return = $batch_file->process($dir => $filename)) {
# Delete temporary file
$pool->finish(
0,
[
"Processed " . $filename . ($return == -1 ? " - already existing" : ''),
$temp,
$filename
]
);
#$pool->finish(0, ["Processed " . $filename, $temp]);
}
else {
# Delete temporary file
$pool->finish(1, ["Unable to process " . $dir, $temp]);
};
}
# Unable to extract
else {
$pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
};
};
}
else {
print "Input is neither a directory nor an archive.\n\n";
};
$pool->wait_all_children;
# Delete cache file
unlink($cache_file) if $cache_delete;
# Close tar filehandle
if ($to_tar && $tar_fh) {
$tar_archive->finish;
$tar_fh->close;
print "Wrote to tar archive.\n";
};
print timestr(timediff(Benchmark->new, $t))."\n";
print "Done.\n";
};
# For an archive, this will create the list
# of all sigles to process
sub set_sigle {
my $archive = shift;
my $prefix = 1;
my @dirs = ();
# No sigles given
unless (@sigle) {
# Get files
foreach ($archive->list_texts) {
push @dirs, $_;
# Split path information
($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
# TODO: Make this OS independent
push @sigle, join '/', $corpus, $doc, $text;
};
}
# Check sigle for doc sigles
else {
my @new_sigle;
my $prefix_check = 0;
# Iterate over all sigle
foreach (@sigle) {
# Sigle is a doc sigle
if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
print "$_ ...";
# Check if a prefix is needed
unless ($prefix_check) {
if ($prefix = $archive->check_prefix) {
print " with prefix ...";
};
$prefix_check = 1;
};
print "\n";
print '... ' . (
$archive->extract_sigle([$_], $output, $sequential_extraction ? 1 : $jobs)
? '' : 'not '
);
print "extracted.\n";
}
# Sigle is a text sigle
else {
push @new_sigle, $_;
unless ($prefix_check) {
if ($prefix = $archive->check_prefix) {
print " with prefix ...";
};
$prefix_check = 1;
};
};
};
@sigle = @new_sigle;
};
return $prefix;
};
# Cleanup temporary extraction directory
if ($extract_dir) {
my $objects = remove_tree($extract_dir, { safe => 1 });
print "Removed directory $extract_dir with $objects objects.\n";
};
print "\n";
__END__
=pod
=encoding utf8
=head1 NAME
korapxml2krill - Merge KorapXML data and create Krill documents
=head1 SYNOPSIS
korapxml2krill [archive|extract|serial] --input <directory|archive> [options]
=head1 DESCRIPTION
L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
The C<korapxml2krill> command line tool is a simple wrapper of this library.
=head1 INSTALLATION
The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
In case everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
=head1 ARGUMENTS
$ korapxml2krill -z --input <directory> --output <filename>
Called without a command, C<korapxml2krill> converts the directory of a single KorAP-XML document.
It expects the input to point to the text level folder.
=over 2
=item B<archive>
$ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Converts an archive of KorAP-XML documents. It expects a directory
(pointing to the corpus level folder) or one or more zip files as input.
=item B<extract>
$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
Extracts KorAP-XML documents from a zip file.
=item B<serial>
$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be pre-merged or globs).
The C<--output> directory is treated as the base directory, in which
subdirectories are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
=back
=head1 OPTIONS
=over 2
=item B<--input|-i> <directory|zip file>
Directory or zip file(s) of documents to convert.
Without a command, C<korapxml2krill> expects the folder of a single KorAP-XML
document, while C<archive> expects a KorAP-XML corpus folder or a zip
file for batch processing multiple files.
C<extract> expects zip files only.
C<archive> supports multiple input zip files with the constraint,
that the first archive listed contains all primary data files
and all meta data files.
-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Input may also be defined using BSD glob wildcards.
-i 'file/news*.zip'
The extended input array will be sorted in length order, so the shortest
path needs to contain all primary data files and all meta data files.
(The directory structure follows the base directory format,
that may include a C<.> root folder.
In this case further archives lacking a C<.> root folder
need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
To support zip files, a version of C<unzip> needs to be installed that is
compatible with the archive file.
B<The root folder switch using the hash sign is experimental and
may vanish in future versions.>
=item B<--input-base|-ib> <directory>
The base directory for inputs.
=item B<--output|-o> <directory|file>
Output folder for archive processing, or
document name for single output (optional);
writes to C<STDOUT> by default
(in case C<output> is not made mandatory by further options).
=item B<--overwrite|-w>
Overwrite files that already exist.
=item B<--token|-t> <foundry>#<file>
Define the default tokenization by specifying
the name of the foundry and optionally the name
of the layer-file. Defaults to C<OpenNLP#tokens>.
This will directly take the file instead of running
the layer implementation!
=item B<--base-sentences|-bs> <foundry>#<layer>
Define the layer for base sentences.
If given, this will be used instead of using C<Base#Sentences>.
Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Defaults to unset.
=item B<--base-paragraphs|-bp> <foundry>#<layer>
Define the layer for base paragraphs.
If given, this will be used instead of using C<Base#Paragraphs>.
Currently C<DeReKo#Structure> is the only additional layer supported.
Defaults to unset.
=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
Define the layer for base pagebreaks.
Currently C<DeReKo#Structure> is the only layer supported.
Defaults to unset.
=item B<--skip|-s> <foundry>[#<layer>]
Skip specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Can be set multiple times.
=item B<--anno|-a> <foundry>#<layer>
Convert specific annotations by specifying the foundry
(and optionally the layer with a C<#>-prefix),
e.g. C<Mate> or C<Mate#Morpho>.
Can be set multiple times.
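In combination with C<--skip '#ALL'>, only the listed annotations
will be converted, e.g. (a hypothetical invocation)

  $ korapxml2krill -z -i <directory> -o doc.json.gz \
      -s '#ALL' -a DeReKo#Structure -a TreeTagger#Morpho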
=item B<--primary|-p>
Output primary data or not. Defaults to C<true>.
Can be flagged using C<--no-primary> as well.
This is I<deprecated>.
=item B<--non-word-tokens|-nwt>
Tokenize non-word tokens like word tokens (defined as matching
C</[\d\w]/>). Useful to treat punctuation marks as tokens.
Defaults to unset.
=item B<--non-verbal-tokens|-nvt>
Tokenize non-verbal tokens that are marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' (C<U+25AE>).
Defaults to unset.
=item B<--jobs|-j>
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).
If C<sequential-extraction> is not set to false, this will
also apply to extraction.
Pass -1, and the value will be set automatically to 5
times the number of available cores.
This is I<experimental>.
=item B<--koral|-k>
Version of the output format. Supported versions are:
C<0> for legacy serialization, C<0.03> for serialization
with metadata fields as key-values on the root object,
C<0.4> for serialization with metadata fields as a list
of C<"@type":"koral:field"> objects.
Currently defaults to C<0.03>.
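As a schematic illustration (the exact serialization is defined by
the Krill format; the shape shown here is simplified), a metadata
field in C<0.03> appears as a key-value pair on the root object

  "textSigle" : "WPD17/060/18486"

while C<0.4> represents it as one object in a field list:

  {
    "@type" : "koral:field",
    "key"   : "textSigle",
    "value" : "WPD17/060/18486"
  }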
=item B<--sequential-extraction|-se>
Flag to indicate whether the C<jobs> value also applies to extraction.
Some systems may have problems with extracting multiple archives
to the same folder at the same time.
Can be flagged using C<--no-sequential-extraction> as well.
Defaults to C<false>.
=item B<--meta|-m>
Define the metadata parser to use. Defaults to C<I5>.
Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
This is I<experimental>.
=item B<--pretty|-y>
Pretty print JSON output. Defaults to C<false>.
This is I<deprecated>.
=item B<--gzip|-z>
Compress the output.
Expects a defined C<output> file in single processing.
=item B<--cache|-c>
File to mmap a cache (using L<Cache::FastMmap>).
Defaults to C<korapxml2krill.cache> in the calling directory.
=item B<--cache-size|-cs>
Size of the cache. Defaults to C<50m>.
=item B<--cache-init|-ci>
Initialize cache file.
Can be flagged using C<--no-cache-init> as well.
Defaults to C<true>.
=item B<--cache-delete|-cd>
Delete cache file after processing.
Can be flagged using C<--no-cache-delete> as well.
Defaults to C<true>.
=item B<--config|-cfg>
Configure the parameters of your call in a file
of key-value pairs separated by whitespace
overwrite 1
token DeReKo#Structure
...
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
C<base-sentences>, C<base-paragraphs>,
C<base-pagebreaks>,
C<skip> (semicolon separated), C<sigle>
(semicolon separated), C<anno> (semicolon separated).
Configuration parameters will always be overwritten by
passed parameters.
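A hypothetical configuration file may look like this (all values are
illustrative):

  overwrite  1
  gzip       1
  jobs       4
  token      OpenNLP#tokens
  log        INFO
  skip       #ALL
  anno       DeReKo#Structure;TreeTagger#Morpho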
=item B<--temporary-extract|-te>
Only valid for the C<archive> command.
This will first extract all files into a
directory and then start the conversion.
If the directory is given as C<:temp:>,
a temporary directory is used.
This is especially useful to avoid
massive unzipping overhead and potential
network latency.
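For example (with a hypothetical archive name):

  $ korapxml2krill archive -i corpus.zip -o output/ -te :temp: -z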
=item B<--to-tar>
Only valid for the C<archive> command.
Writes the output into a tar archive.
=item B<--sigle|-sg>
Extract the given texts.
Can be set multiple times.
I<Currently only supported on C<extract>.>
Sigles have the structure C<Corpus>/C<Document>/C<Text>.
In case the C<Text> path is omitted, the whole document will be extracted.
On the document level, the postfix wildcard C<*> is supported.
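For example (with hypothetical sigles; the second entry uses the
document level wildcard):

  $ korapxml2krill extract -i corpus.zip -o texts/ \
      -sg 'WPD17/060/18486' -sg 'WPD17/06*'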
=item B<--log|-l>
The L<Log4perl> log level, defaults to C<ERROR>.
=item B<--help|-h>
Print this document.
=item B<--version|-v>
Print version information.
=back
=head1 ANNOTATION SUPPORT
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
Base
#Paragraphs
#Sentences
Connexor
#Morpho
#Phrase
#Sentences
#Syntax
CoreNLP
#Constituency
#Morpho
#NamedEntities
#Sentences
CMC
#Morpho
DeReKo
#Structure
DGD
#Morpho
#Structure
DRuKoLa
#Morpho
Glemm
#Morpho
HNC
#Morpho
LWC
#Dependency
Malt
#Dependency
MarMoT
#Morpho
Mate
#Dependency
#Morpho
MDParser
#Dependency
OpenNLP
#Morpho
#Sentences
Sgbr
#Lemma
#Morpho
Talismane
#Dependency
#Morpho
TreeTagger
#Morpho
#Sentences
XIP
#Constituency
#Morpho
#Sentences
More importers are in preparation.
New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
See the built-in annotation importers as examples.
=head1 About KorAP-XML
KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
data model (Bański et al. 2013), where text data are stored physically
separated from their interpretations (i.e. annotations).
A text document in KorAP-XML therefore consists of several files
containing primary data, metadata and annotations.
The structure of a single KorAP-XML document can be as follows:
- data.xml
- header.xml
+ base
- tokens.xml
- ...
+ struct
- structure.xml
- ...
+ corenlp
- morpho.xml
- constituency.xml
- ...
+ tree_tagger
- morpho.xml
- ...
- ...
The C<data.xml> contains the primary data, the C<header.xml> contains
the metadata, and the annotation layers are stored in subfolders
like C<base>, C<struct> or C<corenlp>
(so-called "foundries"; Bański et al. 2013).
Metadata is available in the TEI-P5 variant I5
(Lüngen and Sperberg-McQueen 2012). See the documentation in
L<KorAP::XML::Meta::I5> for translatable fields.
Annotations correspond to a variant of the TEI-P5 feature structures
(TEI Consortium; Lee et al. 2004).
Multiple KorAP-XML documents are organized on three levels following
the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
corpus E<gt> document E<gt> text. On each level, metadata information
can be stored, which C<korapxml2krill> will merge into a single metadata
object per text. A corpus is therefore structured as follows:
+ <corpus>
- header.xml
+ <document>
- header.xml
+ <text>
- data.xml
- header.xml
- ...
- ...
A single text can be identified by the concatenation of
the corpus identifier, the document identifier and the text identifier.
This identifier is called the text sigle
(e.g. a text with the identifier C<18486> in the document C<060> in the
corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
(e.g. one zip file per foundry), which is also supported (see C<--input>).
Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
in the form of a test suite.
The resulting JSON format merges all annotation layers
based on a single token stream.
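Schematically (strongly simplified; the actual term format is defined
by Krill), every token in the stream carries the terms of all
converted foundries and layers:

  "data" : {
    "stream" : [
      ["opennlp/p:ART", "tt/l:die", ...],
      ["opennlp/p:NN", "tt/l:Sonne", ...],
      ...
    ]
  }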
=head2 References
Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
KorAP data model: first approximation, December.
Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
"The New IDS Corpus Analysis Platform: Challenges and Prospects",
Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
"Robust corpus architecture: a new look at virtual collections and data access",
Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on featurestructure representation",
Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
pp. 373-376.
L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
Harald Lüngen and C. M. Sperberg-McQueen (2012):
"A TEI P5 Document Grammar for the IDS Text Model",
Journal of the Text Encoding Initiative, Issue 3 | November 2012.
L<PDF|https://journals.openedition.org/jtei/pdf/508>
TEI Consortium, eds:
"Feature Structures",
Guidelines for Electronic Text Encoding and Interchange.
L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
=head1 AVAILABILITY
https://github.com/KorAP/KorAP-XML-Krill
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: L<Nils Diewald|https://nils-diewald.de/>
Contributor: Eliza Margaretha
L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
=cut