script/korapxml2krill - KorAP/KorAP-XML-Krill - Gitiles

 #!/usr/bin/env perl
 use strict;
 use warnings;
 use FindBin;
 BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
 use File::Spec::Functions qw/catfile catdir/;
 use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
 use Benchmark qw/:hireswallclock/;
 use IO::Compress::Gzip qw/$GzipError/;
 use Log::Log4perl;
 use Pod::Usage;
 use Cache::FastMmap;
 use Directory::Iterator;
 use KorAP::XML::Krill;
 use KorAP::XML::Archive;
 use KorAP::XML::Tokenizer;
 use KorAP::XML::Batch::File;
 use Parallel::ForkManager;
 # TODO: use Parallel::Loops
 # TODO: make output files

 # CHANGES:
 # ----------------------------------------------------------
 # 2013/11/25
 # - Initial release
 #
 # 2014/10/29
 # - Merges foundry data to create indexer friendly documents
 #
 # 2016/02/04
 # - renamed to korapxml2krill
 # - added Schreibgebrauch support
 #
 # 2016/02/12
 # - fixed foundry skipping
 # - Support overwrite in archive processing
 #
 # 2016/02/14
 # - Added version information
 # - Added support for archive files
 #
 # 2016/02/15
 # - Fixed temporary directory bug
 # - Improved skipping before unzipping
 # - Added EXPERIMENTAL concurrency support
 #
 # 2016/02/23
 # - Merge korapxml2krill and korapxml2krill_dir
 #
 # 2016/02/27
 # - Added extract function
 #
 # 2016/03/17
 # - Added meta switch
 #
 # 2016/03/18
 # - Added meta data caching
 #
 # 2016/06/27
 # - Added multi archive support
 # - Added prefix negation support
 # - Added Malt#Dependency support
 #
 # 2016/07/06
 # - Added MDParser#Dependency
 # ----------------------------------------------------------

 # Date of the last change, shown in the version banner
 our $LAST_CHANGE = '2016/08/16';

 # Directory containing this script, as determined by FindBin
 our $LOCAL = $FindBin::Bin;

 # Version banner passed to pod2usage() for --help, --version and
 # usage errors. A plain double-quoted string is used instead of a
 # here-document, as a here-document terminator is only recognized at
 # the very start of a line and breaks as soon as the file is indented.
 our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

 # Parse command: a leading argument that does not start with a dash
 # is taken off @ARGV and treated as the sub command
 my $cmd;
 our @ARGV;
 if ($ARGV[0] and $ARGV[0] !~ /^-/) {
   $cmd = shift @ARGV;
 };

 # Options that can be given multiple times
 my (@skip, @sigle, @anno, @input);
 my $text;

 # Scalar options without a default
 my ($output, $overwrite, $meta, $token_base, $gzip, $primary, $pretty);

 # Scalar options with a default
 my $cache_file   = 'korapxml2krill.cache';
 my $cache_size   = '50m';
 my $cache_delete = 1;
 my $cache_init   = 1;
 my $log_level    = 'ERROR';
 my $jobs         = 0;

 # --help: version banner and the main sections of the documentation
 my $show_help = sub {
   pod2usage(
     -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
     -verbose => 99,
     -msg => $VERSION_MSG,
   );
 };

 # --version: version banner with the lowest verbosity of pod2usage
 my $show_version = sub {
   pod2usage(
     -verbose => 0,
     -msg => $VERSION_MSG
   )
 };

 # Parse options from the command line
 GetOptions(
   'input|i=s'        => \@input,
   'output|o=s'       => \$output,
   'overwrite|w'      => \$overwrite,
   'meta|m=s'         => \$meta,
   'token|t=s'        => \$token_base,
   'gzip|z'           => \$gzip,
   'skip|s=s'         => \@skip,
   'sigle|sg=s'       => \@sigle,
   'cache|c=s'        => \$cache_file,
   'cache-size|cs=s'  => \$cache_size,
   'cache-delete|cd!' => \$cache_delete,
   'cache-init|ci!'   => \$cache_init,
   'log|l=s'          => \$log_level,
   'anno|a=s'         => \@anno,
   'primary|p!'       => \$primary,
   'pretty|y'         => \$pretty,
   'jobs|j=i'         => \$jobs,
   'help|h'           => $show_help,
   'version|v'        => $show_version
 );

 # Arguments for pod2usage() on invalid calls: version banner,
 # main sections of the documentation and exit status 1
 my %ERROR_HASH = (
   -msg      => $VERSION_MSG,
   -verbose  => 99,
   -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
   -exit     => 1
 );

 # Input has to be defined
 unless (@input) {
   pod2usage(%ERROR_HASH);
 };

 # Gzip has no effect, if no output is given
 if ($gzip and not $output) {
   pod2usage(%ERROR_HASH);
 };

 # Initialize log4perl object:
 # The root logger uses the level given by --log (in upper case)
 # and a coloured screen appender registered as STDERR
 my %log_conf = (
   'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n',
   'log4perl.appender.STDERR.layout' => 'PatternLayout',
   'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
   'log4perl.rootLogger' => uc($log_level) . ', STDERR'
 );
 Log::Log4perl->init(\%log_conf);

 # Logger used throughout the script
 my $log = Log::Log4perl->get_logger('main');

 # Lookup table of everything passed with --skip (in lower case)
 my %skip = map { (lc($_) => 1) } @skip;

 # All known annotations as [Foundry, Layer] pairs
 my @layers = (
   ['Base', 'Sentences'],
   ['Base', 'Paragraphs'],

   # Connexor
   ['Connexor', 'Morpho'],
   ['Connexor', 'Syntax'],
   ['Connexor', 'Phrase'],
   ['Connexor', 'Sentences'],

   # CoreNLP
   ['CoreNLP', 'NamedEntities'],
   ['CoreNLP', 'Sentences'],
   ['CoreNLP', 'Morpho'],
   ['CoreNLP', 'Constituency'],

   # DeReKo
   ['DeReKo', 'Structure'],

   # Glemm
   ['Glemm', 'Morpho'],

   # Malt
   ['Malt', 'Dependency'],

   # MDParser
   ['MDParser', 'Dependency'],

   # Mate
   ['Mate', 'Morpho'],
   ['Mate', 'Dependency'],

   # OpenNLP
   ['OpenNLP', 'Morpho'],
   ['OpenNLP', 'Sentences'],

   # Schreibgebrauch
   ['Sgbr', 'Lemma'],
   ['Sgbr', 'Morpho'],

   # TreeTagger
   ['TreeTagger', 'Morpho'],
   ['TreeTagger', 'Sentences'],

   # XIP
   ['XIP', 'Morpho'],
   ['XIP', 'Constituency'],
   ['XIP', 'Sentences'],
   ['XIP', 'Dependency']
 );

 # Check filters
 my @filtered_anno;

 # Everything is skipped: only take the annotations passed with --anno
 if ($skip{'#all'}) {
   @filtered_anno = map { [ split('#', $_) ] } @anno;
 }

 # Add all annotations that are not skipped
 else {
   foreach my $pair (@layers) {
     my $foundry = lc($pair->[0]);
     my $layer   = lc($pair->[1]);

     # Skip if Foundry or Foundry#Layer should be skipped
     next if $skip{$foundry};
     next if $skip{$foundry . '#' . $layer};

     push @filtered_anno, $pair;
   };
 };

 # Get tokenization basis: split the value of --token at the hash sign
 # into foundry and layer. Declaration and conditional assignment are
 # kept apart, as the behaviour of "my ... if ..." is undefined
 # (see the section on statement modifiers in perlsyn).
 my ($token_base_foundry, $token_base_layer);
 if ($token_base) {
   ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
 };

 # TODO: This should not be initialized for batch
 # Cache object, configured by the --cache, --cache-size
 # and --cache-init options
 my @cache_conf = (
   share_file => $cache_file,
   cache_size => $cache_size,
   init_file  => $cache_init
 );
 my $cache = Cache::FastMmap->new(@cache_conf);

 # Batch converter, receiving the cache, the logger, the tokenization
 # basis, the filtered annotations and the output related options
 my @batch_conf = (
   cache     => $cache,
   meta_type => $meta,
   overwrite => $overwrite,
   foundry   => $token_base_foundry,
   layer     => $token_base_layer,
   gzip      => $gzip,
   log       => $log,
   primary   => $primary,
   pretty    => $pretty,
   anno      => \@filtered_anno
 );
 my $batch_file = KorAP::XML::Batch::File->new(@batch_conf);


 # Get file name based on path information.
 #
 # Expects the path of a text and returns a flat name for it:
 # A leading "tmp/<name>" directory (with or without a leading slash)
 # and the first occurrence of the first input path (optionally
 # preceded by a slash) are removed, all remaining slashes are turned
 # into dashes and leading dashes are stripped.
 sub get_file_name ($) {
   my $file = shift;

   # The input path is literal text and not a pattern: quote it, so
   # characters like '.', '+' or '(' in a path only match themselves
   # and can not break the expression
   my $input_path = quotemeta($input[0]);

   $file =~ s!^/?tmp/[^/]+!!;
   $file =~ s/\/?$input_path//;
   $file =~ tr/\//-/;
   $file =~ s{^-+}{};
   return $file;
 };


 # Write file
 #sub write_file {
 #  my $anno = shift;
 #  my $file = get_file_name $anno;
 #
 #  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
 #
 #  my $call = 'perl ' . $LOCAL . '/korapxml2krill';
 #  $call .= ' -i ' . $anno;
 #  $call .= ' -o ' . $output . '/' . $file . '.json';
 #  $call .= '.gz -z' if $gzip;
 #  $call .= ' -m ' . $meta if $meta;
 #  $call .= ' -w' if $overwrite;
 #  $call .= ' -t ' . $token_base if $token_base;
 #  $call .= ' -l ' . $log_level if $log_level;
 #  $call .= ' -c ' . $cache_file;
 #  $call .= ' -cs ' . $cache_size;
 #  $call .= ' --no-cache-delete'; # Don't delete the cache
 #  $call .= ' --no-cache-init'; # Don't initialize the cache
 #  $call .= ' --no-primary ' if $primary;
 #  $call .= ' -y ' . $pretty if $pretty;
 #  $call .= ' -a ' . $_ foreach @anno;
 #  $call .= ' -s ' . $_ foreach @skip;
 #  system($call);
 #  return "$file";
 #};


 # Convert sigle to path construct:
 # "Corpus_Document.Text" becomes "Corpus/Document/Text"
 # (surrounding whitespace is removed)
 foreach my $sigle (@sigle) {
   $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
 };

 # Process a single file
 unless ($cmd) {

   # Take the start time at compile time
   BEGIN {
     $main::TIME = Benchmark->new;
     $main::LAST_STOP = Benchmark->new;
   };

   # Log the time passed since the last stop
   # and since the start of the program
   sub stop_time {
     my $now = Benchmark->new;
     my $lap = timestr(timediff($now, $main::LAST_STOP));
     my $overall = timestr(timediff($now, $main::TIME));
     $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
     $main::LAST_STOP = $now;
   };

   # Create and parse new document;
   # the path is passed with a trailing slash
   my $input = $input[0];
   $input =~ s{([^/])$}{$1/};

   $batch_file->process($input, $output);

   # Delete cache file
   if ($cache_delete) {
     unlink($cache_file);
   };

   stop_time();
 }

 # Extract XML files
 #
 # Extracts the texts given by --sigle from the input archive(s) to the
 # output directory. Exits with status 0 if all sigles were extracted
 # and with status 1 on any failure.
 elsif ($cmd eq 'extract') {

   # The target has to be an existing directory
   if ($output && !-d $output) {
     print "Directory '$output' does not exist.\n\n";
     exit(1);
   };

   # TODO: Support sigles and full archives

   if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

     unless ($archive->test_unzip) {
       print "Unzip is not installed or incompatible.\n\n";
       exit(1);
     };

     # Add further annotation archived
     $archive->attach($_) foreach @input;

     # Iterate over all given sigles and extract;
     # count the sigles that could not be extracted
     my $failed = 0;
     foreach my $sigle (@sigle) {
       print "$sigle ";
       my $extracted = $archive->extract('./' . $sigle, $output);
       $failed++ unless $extracted;
       print '' . ($extracted ? '' : 'not ');
       print "extracted.\n";
     };

     print "\n";

     # Only signal an error if something went wrong
     exit($failed ? 1 : 0);
   }
   else {
     $log->error('Unable to extract from primary archive ' . $input[0]);
     exit(1);
   };
 }

 # Process an archive
 #
 # Converts all texts of a directory or of a zip archive to the output
 # directory, optionally in parallel forks (see --jobs).
 elsif ($cmd eq 'archive') {

   # TODO: Support sigles

   # The output has to be an existing directory
   if ($output && !-d $output) {
     print "Directory '$output' does not exist.\n\n";
     exit(1);
   };

   # Zero means: everything runs in the parent process
   my $pool = Parallel::ForkManager->new($jobs);

   my $count = 0; # Texts to process
   my $iter  = 1; # Current text in process

   # Report on fork message
   $pool->run_on_finish (
     sub {
       # The pid and the exit code are the first two arguments,
       # the reference passed to finish() is the last one
       my ($pid, $code) = @_;
       my $data = $_[-1];

       # A child may end without handing a message back
       my $msg = (ref $data && defined ${$data}) ? ${$data} : '';

       print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
         ($iter++) . "/$count]" .
         ($code ? " $code" : '') .
         " $msg\n";
     }
   );

   my $t;
   print "Reading data ...\n";

 #  unless (Cache::FastMmap->new(
 #    share_file => $cache_file,
 #    cache_size => $cache_size,
 #    init_file => $cache_init
 #  )) {
 #    print "Unable to intialize cache '$cache_file'\n\n";
 #    exit(1);
 #  };

   # Input is a directory
   if (-d $input[0]) {
     my $it = Directory::Iterator->new($input[0]);
     my @dirs;
     my $dir;

     # Collect the directory of every data.xml file
     while (1) {
       if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
         push @dirs, $dir;
         $it->prune;
       };
       last unless $it->next;
     };

     print "Start processing ...\n";
     $t = Benchmark->new;
     $count = scalar @dirs;

   DIRECTORY_LOOP:
     for (my $i = 0; $i < $count; $i++) {

       my $filename = catfile(
         $output,
         get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
       );

       # Get the next fork
       my $pid = $pool->start and next DIRECTORY_LOOP;
       my $msg;

       $msg = $batch_file->process($dirs[$i] => $filename);
       $pool->finish(0, \$msg);
     };
   }

   # Input is a file
   elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

     unless ($archive->test_unzip) {
       print "Unzip is not installed or incompatible.\n\n";
       exit(1);
     };

     # Temporary directories are created below - do not rely
     # on another module to have loaded File::Temp
     require File::Temp;

     # Add further annotation archived
     $archive->attach($_) foreach @input;

     print "Start processing ...\n";
     $t = Benchmark->new;
     my @dirs = $archive->list_texts;
     $count = scalar @dirs;

   ARCHIVE_LOOP:
     for (my $i = 0; $i < $count; $i++) {

       # Split path information
       my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

       my $filename = catfile(
         $output,
         get_file_name(
           catfile($corpus, $doc, $text)
             . '.json' . ($gzip ? '.gz' : '')
           )
       );

       # Get the next fork
       my $pid = $pool->start and next ARCHIVE_LOOP;

       # Create temporary file
       my $temp = File::Temp->newdir;

       my $msg;

       # Extract from archive
       if ($archive->extract($dirs[$i], $temp)) {

         # Create corpus directory
         my $input = catdir("$temp", $corpus);

         # Temporary directory
         my $dir = catdir($input, $doc, $text);

         # Write file - the target is the file name computed for
         # this text, not the output directory itself
         $msg = $batch_file->process($dir => $filename);

         $temp = undef;
         $pool->finish(0, \$msg);
       }
       else {

         $temp = undef;
         $msg = "Unable to extract " . $dirs[$i] . "\n";
         $pool->finish(1, \$msg);
       };
     };
   }

   else {
     print "Input is neither a directory nor an archive.\n\n";

     # Nothing was processed and no start time was taken:
     # clean up and signal the failure instead of reporting timings
     unlink($cache_file) if $cache_delete;
     exit(1);
   };

   $pool->wait_all_children;

   # Delete cache file
   unlink($cache_file) if $cache_delete;

   print "Done.\n";
   print timestr(timediff(Benchmark->new, $t))."\n\n";
 }

 # Unknown command
 else {
   warn "Unknown command '$cmd'.\n\n";
   pod2usage(%ERROR_HASH);
 }

 __END__

 =pod

 =encoding utf8

 =head1 NAME

 korapxml2krill - Merge KorapXML data and create Krill documents


 =head1 SYNOPSIS

   $ korapxml2krill -z --input <directory> --output <filename>
   $ korapxml2krill archive -z --input <directory> --output <directory>
   $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>


 =head1 DESCRIPTION

 L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
 compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
 The C<korapxml2krill> command line tool is a simple wrapper to the library.


 =head1 INSTALLATION

 The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.

   $ cpanm https://github.com/KorAP/KorAP-XML-Krill

 In case everything went well, the C<korapxml2krill> tool will
 be available on your command line immediately.


 =head1 ARGUMENTS

 =over 2

 =item B<archive>

 Process an archive as a Zip-file or a folder of KorAP-XML documents.

 =item B<extract>

 Extract KorAP-XML files from a Zip-file.

 =back


 =head1 OPTIONS

 =over 2

 =item B<--input|-i> <directory|file|files>

 Directory or archive file of documents to convert.

 Archiving supports multiple input archives with the constraint,
 that the first archive listed contains all primary data files
 and all meta data files.

   -i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip

 (The directory structure follows the base directory format,
 that may include a C<.> root folder.
 In this case further archives lacking a C<.> root folder
 need to be passed with a hash sign in front of the archive's name.)

 =item B<--output|-o> <directory|file>

 Output folder for archive processing or
 document name for single output (optional),
 writes to C<STDOUT> by default
 (in case C<output> is not mandatory due to further options).

 =item B<--overwrite|-w>

 Overwrite files that already exist.

 =item B<--token|-t> <foundry>[#<file>]

 Define the default tokenization by specifying
 the name of the foundry and optionally the name
 of the layer-file. Defaults to C<OpenNLP#tokens>.

 =item B<--skip|-s> <foundry>[#<layer>]

 Skip specific annotations by specifying the foundry
 (and optionally the layer with a C<#>-prefix),
 e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
 Can be set multiple times.

 =item B<--anno|-a> <foundry>#<layer>

 Convert specific annotations by specifying the foundry
 (and optionally the layer with a C<#>-prefix),
 e.g. C<Mate> or C<Mate#Morpho>.
 Can be set multiple times.

 =item B<--primary|-p>

 Output primary data or not. Defaults to C<true>.
 Can be flagged using C<--no-primary> as well.
 This is I<deprecated>.

 =item B<--jobs|-j>

 Define the number of concurrent jobs in separate forks
 for archive processing.
 Defaults to C<0> (everything runs in a single process).
 This is I<experimental>.

 =item B<--meta|-m>

 Define the metadata parser to use. Defaults to C<I5>.
 Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
 This is I<experimental>.

 =item B<--pretty|-y>

 Pretty print JSON output. Defaults to C<false>.
 This is I<deprecated>.

 =item B<--gzip|-z>

 Compress the output.
 Expects a defined C<output> file in single processing.

 =item B<--cache|-c>

 File to mmap a cache (using L<Cache::FastMmap>).
 Defaults to C<korapxml2krill.cache> in the calling directory.

 =item B<--cache-size|-cs>

 Size of the cache. Defaults to C<50m>.

 =item B<--cache-init|-ci>

 Initialize cache file.
 Can be flagged using C<--no-cache-init> as well.
 Defaults to C<true>.

 =item B<--cache-delete|-cd>

 Delete cache file after processing.
 Can be flagged using C<--no-cache-delete> as well.
 Defaults to C<true>.

 =item B<--sigle|-sg>

 Extract the given text sigles.
 Can be set multiple times.
 I<Currently only supported on C<extract>.>
 Sigles have the structure C<Corpus>/C<Document>/C<Text>.

 =item B<--log|-l>

 The L<Log4perl> log level, defaults to C<ERROR>.

 =item B<--help|-h>

 Print this document.

 =item B<--version|-v>

 Print version information.

 =back

 =head1 ANNOTATION SUPPORT

 L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
 developed in the KorAP project that are part of the KorAP preprocessing pipeline.
 The base foundry with paragraphs, sentences, and the text element is mandatory for
 L<Krill|https://github.com/KorAP/Krill>.

 =over 2

 =item B<Base>

 =over 4

 =item #Paragraphs

 =item #Sentences

 =back

 =item B<Connexor>

 =over 4

 =item #Morpho

 =item #Phrase

 =item #Sentences

 =item #Syntax

 =back

 =item B<CoreNLP>

 =over 4

 =item #Constituency

 =item #Morpho

 =item #NamedEntities

 =item #Sentences

 =back

 =item B<DeReKo>

 =over 4

 =item #Structure

 =back

 =item B<Glemm>

 =over 4

 =item #Morpho

 =back

 =item B<Malt>

 =over 4

 =item #Dependency

 =back

 =item B<Mate>

 =over 4

 =item #Dependency

 =item #Morpho

 =back

 =item B<MDParser>

 =over 4

 =item #Dependency

 =back

 =item B<OpenNLP>

 =over 4

 =item #Morpho

 =item #Sentences

 =back

 =item B<Sgbr>

 =over 4

 =item #Lemma

 =item #Morpho

 =back

 =item B<TreeTagger>

 =over 4

 =item #Morpho

 =item #Sentences

 =back

 =item B<XIP>

 =over 4

 =item #Constituency

 =item #Dependency

 =item #Morpho

 =item #Sentences

 =back

 =back

 More importers are in preparation.
 New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
 See the built-in annotation importers as examples.

 =head1 AVAILABILITY

   https://github.com/KorAP/KorAP-XML-Krill


 =head1 COPYRIGHT AND LICENSE

 Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>

 Author: L<Nils Diewald|http://nils-diewald.de/>

 L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
 Corpus Analysis Platform at the
 L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
 member of the
 L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.

 This program is free software published under the
 L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

 =cut
	#!/usr/bin/env perl
	use strict;
	use warnings;
	use FindBin;
	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
	use File::Spec::Functions qw/catfile catdir/;
	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
	use Benchmark qw/:hireswallclock/;
	use IO::Compress::Gzip qw/$GzipError/;
	use Log::Log4perl;
	use Pod::Usage;
	use Cache::FastMmap;
	use Directory::Iterator;
	use KorAP::XML::Krill;
	use KorAP::XML::Archive;
	use KorAP::XML::Tokenizer;
	use KorAP::XML::Batch::File;
	use Parallel::ForkManager;
	# TODO: use Parallel::Loops
	# TODO: make output files

	# CHANGES:
	# ----------------------------------------------------------
	# 2013/11/25
	# - Initial release
	#
	# 2014/10/29
	# - Merges foundry data to create indexer friendly documents
	#
	# 2016/02/04
	# - renamed to korapxml2krill
	# - added Schreibgebrauch support
	#
	# 2016/02/12
	# - fixed foundry skipping
	# - Support overwrite in archive processing
	#
	# 2016/02/14
	# - Added version information
	# - Added support for archive files
	#
	# 2016/02/15
	# - Fixed temporary directory bug
	# - Improved skipping before unzipping
	# - Added EXPERIMENTAL concurrency support
	#
	# 2016/02/23
	# - Merge korapxml2krill and korapxml2krill_dir
	#
	# 2016/02/27
	# - Added extract function
	#
	# 2016/03/17
	# - Added meta switch
	#
	# 2016/03/18
	# - Added meta data caching
	#
	# 2016/06/27
	# - Added multi archive support
	# - Added prefix negation support
	# - Added Malt#Dependency support
	#
	# 2016/07/06
	# - Added MDParser#Dependency
	# ----------------------------------------------------------

	# Date of the last change, shown in the version banner
	our $LAST_CHANGE = '2016/08/16';

	# Directory containing this script, as determined by FindBin
	our $LOCAL = $FindBin::Bin;

	# Version banner passed to pod2usage() for --help, --version and
	# usage errors. A plain double-quoted string is used instead of a
	# here-document, as a here-document terminator is only recognized at
	# the very start of a line and breaks as soon as the file is indented.
	our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

	# Parse comand
	my $cmd;
	our @ARGV;
	if ($ARGV[0] && index($ARGV[0], '-') != 0) {
	$cmd = shift @ARGV;
	};

	my (@skip, @sigle, @anno, @input);
	my $text;

	# Parse options from the command line.
	# Each specification lists the long and the short name of an option
	# separated by a plain pipe (an escaped pipe would become part of
	# the option name); "=s" expects a string, "=i" an integer and "!"
	# makes the option negatable with a "no" prefix.
	GetOptions(
	  'input|i=s'   => \@input,
	  'output|o=s'  => \(my $output),
	  'overwrite|w' => \(my $overwrite),
	  'meta|m=s'    => \(my $meta),
	  'token|t=s'   => \(my $token_base),
	  'gzip|z'      => \(my $gzip),
	  'skip|s=s'    => \@skip,
	  'sigle|sg=s'  => \@sigle,
	  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
	  'cache-size|cs=s'   => \(my $cache_size = '50m'),
	  'cache-delete|cd!' => \(my $cache_delete = 1),
	  'cache-init|ci!'   => \(my $cache_init = 1),
	  'log|l=s'     => \(my $log_level = 'ERROR'),
	  'anno|a=s'    => \@anno,
	  'primary|p!'  => \(my $primary),
	  'pretty|y'    => \(my $pretty),
	  'jobs|j=i'    => \(my $jobs = 0),

	  # Version banner and the main sections of the documentation
	  'help|h'      => sub {
	    pod2usage(
	      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
	      -verbose => 99,
	      -msg => $VERSION_MSG,
	    );
	  },

	  # Version banner with the lowest verbosity of pod2usage
	  'version|v'   => sub {
	    pod2usage(
	      -verbose => 0,
	      -msg => $VERSION_MSG
	    )
	  }
	);

	# Arguments for pod2usage() on invalid calls: version banner,
	# main sections of the documentation and exit status 1.
	# The section names are separated by plain pipes.
	my %ERROR_HASH = (
	  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
	  -verbose => 99,
	  -msg => $VERSION_MSG,
	  -exit => 1
	);

	# Input has to be defined
	pod2usage(%ERROR_HASH) unless @input;

	# Gzip has no effect, if no output is given
	pod2usage(%ERROR_HASH) if $gzip && !$output;

	# Initialize log4perl object
	Log::Log4perl->init({
	'log4perl.rootLogger' => uc($log_level) . ', STDERR',
	'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
	'log4perl.appender.STDERR.layout' => 'PatternLayout',
	'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
	});

	my $log = Log::Log4perl->get_logger('main');

	my %skip;
	$skip{lc($_)} = 1 foreach @skip;

	my @layers;
	push(@layers, ['Base', 'Sentences']);
	push(@layers, ['Base', 'Paragraphs']);

	# Connexor
	push(@layers, ['Connexor', 'Morpho']);
	push(@layers, ['Connexor', 'Syntax']);
	push(@layers, ['Connexor', 'Phrase']);
	push(@layers, ['Connexor', 'Sentences']);

	# CoreNLP
	push(@layers, ['CoreNLP', 'NamedEntities']);
	push(@layers, ['CoreNLP', 'Sentences']);
	push(@layers, ['CoreNLP', 'Morpho']);
	push(@layers, ['CoreNLP', 'Constituency']);

	# DeReKo
	push(@layers, ['DeReKo', 'Structure']);

	# Glemm
	push(@layers, ['Glemm', 'Morpho']);

	# Malt
	push(@layers, ['Malt', 'Dependency']);

	# MDParser
	push(@layers, ['MDParser', 'Dependency']);

	# Mate
	push(@layers, ['Mate', 'Morpho']);
	push(@layers, ['Mate', 'Dependency']);

	# OpenNLP
	push(@layers, ['OpenNLP', 'Morpho']);
	push(@layers, ['OpenNLP', 'Sentences']);

	# Schreibgebrauch
	push(@layers, ['Sgbr', 'Lemma']);
	push(@layers, ['Sgbr', 'Morpho']);

	# TreeTagger
	push(@layers, ['TreeTagger', 'Morpho']);
	push(@layers, ['TreeTagger', 'Sentences']);

	# XIP
	push(@layers, ['XIP', 'Morpho']);
	push(@layers, ['XIP', 'Constituency']);
	push(@layers, ['XIP', 'Sentences']);
	push(@layers, ['XIP', 'Dependency']);

	# Check filters
	my @filtered_anno;

	# Everything is skipped: only take the annotations passed with --anno
	if ($skip{'#all'}) {
	  foreach (@anno) {
	    push @filtered_anno, [ split('#', $_) ];
	  };
	}

	# Add all annotations that are not skipped
	else {
	  # Add to index file - respect skipping
	  foreach my $info (@layers) {
	    # Skip if Foundry or Foundry#Layer should be skipped
	    unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
	      push @filtered_anno, $info;
	    };
	  };
	};

	# Get tokenization basis: split the value of --token at the hash sign
	# into foundry and layer. Declaration and conditional assignment are
	# kept apart, as the behaviour of "my ... if ..." is undefined
	# (see the section on statement modifiers in perlsyn).
	my ($token_base_foundry, $token_base_layer);
	if ($token_base) {
	  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
	};

	# TODO: This should not be initialized for batch
	my $cache = Cache::FastMmap->new(
	share_file => $cache_file,
	cache_size => $cache_size,
	init_file => $cache_init
	);

	my $batch_file = KorAP::XML::Batch::File->new(
	cache => $cache,
	meta_type => $meta,
	overwrite => $overwrite,
	foundry => $token_base_foundry,
	layer => $token_base_layer,
	gzip => $gzip,
	log => $log,
	primary => $primary,
	pretty => $pretty,
	anno => \@filtered_anno
	);


	# Get file name based on path information.
	#
	# Expects the path of a text and returns a flat name for it:
	# A leading "tmp/<name>" directory (with or without a leading slash)
	# and the first occurrence of the first input path (optionally
	# preceded by a slash) are removed, all remaining slashes are turned
	# into dashes and leading dashes are stripped.
	sub get_file_name ($) {
	  my $file = shift;

	  # The input path is literal text and not a pattern: quote it, so
	  # characters like '.', '+' or '(' in a path only match themselves
	  # and can not break the expression
	  my $input_path = quotemeta($input[0]);

	  $file =~ s!^/?tmp/[^/]+!!;
	  $file =~ s/\/?$input_path//;
	  $file =~ tr/\//-/;
	  $file =~ s{^-+}{};
	  return $file;
	};


	# Write file
	#sub write_file {
	# my $anno = shift;
	# my $file = get_file_name $anno;
	#
	# # TODO: This should be done directly with a data structure! KorAP::XML::Wrap
	#
	# my $call = 'perl ' . $LOCAL . '/korapxml2krill';
	# $call .= ' -i ' . $anno;
	# $call .= ' -o ' . $output . '/' . $file . '.json';
	# $call .= '.gz -z' if $gzip;
	# $call .= ' -m ' . $meta if $meta;
	# $call .= ' -w' if $overwrite;
	# $call .= ' -t ' . $token_base if $token_base;
	# $call .= ' -l ' . $log_level if $log_level;
	# $call .= ' -c ' . $cache_file;
	# $call .= ' -cs ' . $cache_size;
	# $call .= ' --no-cache-delete'; # Don't delete the cache
	# $call .= ' --no-cache-init'; # Don't initialize the cache
	# $call .= ' --no-primary ' if $primary;
	# $call .= ' -y ' . $pretty if $pretty;
	# $call .= ' -a ' . $_ foreach @anno;
	# $call .= ' -s ' . $_ foreach @skip;
	# system($call);
	# return "$file";
	#};


	# Convert sigle to path construct:
	# "Corpus_Document.Text" becomes "Corpus/Document/Text".
	# Surrounding whitespace is optional (\s*) and gets removed.
	s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

	# Process a single file
	unless ($cmd) {
	my $input = $input[0];

	BEGIN {
	$main::TIME = Benchmark->new;
	$main::LAST_STOP = Benchmark->new;
	};

	sub stop_time {
	my $new = Benchmark->new;
	$log->info(
	'The code took: '.
	timestr(timediff($new, $main::LAST_STOP)) .
	' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
	);
	$main::LAST_STOP = $new;
	};

	# Create and parse new document
	$input =~ s{([^/])$}{$1/};

	$batch_file->process($input, $output);

	# Delete cache file
	unlink($cache_file) if $cache_delete;

	stop_time;
	}

	# Extract XML files
	#
	# Extracts the texts given by --sigle from the input archive(s) to the
	# output directory. Exits with status 0 if all sigles were extracted
	# and with status 1 on any failure.
	elsif ($cmd eq 'extract') {

	  # The target has to be an existing directory
	  if ($output && !-d $output) {
	    print "Directory '$output' does not exist.\n\n";
	    exit(1);
	  };

	  # TODO: Support sigles and full archives

	  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

	    unless ($archive->test_unzip) {
	      print "Unzip is not installed or incompatible.\n\n";
	      exit(1);
	    };

	    # Add further annotation archived
	    $archive->attach($_) foreach @input;

	    # Iterate over all given sigles and extract;
	    # count the sigles that could not be extracted
	    my $failed = 0;
	    foreach my $sigle (@sigle) {
	      print "$sigle ";
	      my $extracted = $archive->extract('./' . $sigle, $output);
	      $failed++ unless $extracted;
	      print '' . ($extracted ? '' : 'not ');
	      print "extracted.\n";
	    };

	    print "\n";

	    # Only signal an error if something went wrong
	    exit($failed ? 1 : 0);
	  }
	  else {
	    $log->error('Unable to extract from primary archive ' . $input[0]);
	    exit(1);
	  };
	}

	# Process an archive
	elsif ($cmd eq 'archive') {

	# The output has to be an existing directory
	# (checked once, failing with a non-zero exit status)
	if ($output && !-d $output) {
	  print "Directory '$output' does not exist.\n\n";
	  exit(1);
	};

	# TODO: Support sigles

	# Zero means: everything runs in the parent process
	my $pool = Parallel::ForkManager->new($jobs);

	my $count = 0; # Texts to process
	my $iter = 1; # Current text in process

	# Report on fork message
	$pool->run_on_finish (
	  sub {
	    # The pid and the exit code are the first two arguments,
	    # the reference passed to finish() is the last one
	    my ($pid, $code) = @_;
	    my $data = $_[-1];

	    # A child may end without handing a message back
	    my $msg = (ref $data && defined ${$data}) ? ${$data} : '';

	    print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
	      ($iter++) . "/$count]" .
	      ($code ? " $code" : '') .
	      " $msg\n";
	  }
	);

	my $t;
	print "Reading data ...\n";

	# unless (Cache::FastMmap->new(
	# share_file => $cache_file,
	# cache_size => $cache_size,
	# init_file => $cache_init
	# )) {
	# print "Unable to intialize cache '$cache_file'\n\n";
	# exit(1);
	# };

	# Input is a directory
	if (-d $input[0]) {
	my $it = Directory::Iterator->new($input[0]);
	my @dirs;
	my $dir;

	while (1) {
	if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
	push @dirs, $dir;
	$it->prune;
	};
	last unless $it->next;
	};

	print "Start processing ...\n";
	$t = Benchmark->new;
	$count = scalar @dirs;

	DIRECTORY_LOOP:
	for (my $i = 0; $i < $count; $i++) {

	my $filename = catfile(
	$output,
	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
	);

	# Get the next fork
	my $pid = $pool->start and next DIRECTORY_LOOP;
	my $msg;

	$msg = $batch_file->process($dirs[$i] => $filename);
	$pool->finish(0, \$msg);
	};
	}

	# Input is a file
	elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

	unless ($archive->test_unzip) {
	print "Unzip is not installed or incompatible.\n\n";
	exit(1);
	};

	# Add further annotation archived
	$archive->attach($_) foreach @input;

	print "Start processing ...\n";
	$t = Benchmark->new;
	my @dirs = $archive->list_texts;
	$count = scalar @dirs;

	ARCHIVE_LOOP:
	for (my $i = 0; $i < $count; $i++) {

	# Split path information
	my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

	my $filename = catfile(
	$output,
	get_file_name(
	catfile($corpus, $doc, $text)
	. '.json' . ($gzip ? '.gz' : '')
	)
	);

	# Get the next fork
	my $pid = $pool->start and next ARCHIVE_LOOP;

	# Create temporary file
	my $temp = File::Temp->newdir;

	my $msg;

	# Extract from archive
	if ($archive->extract($dirs[$i], $temp)) {

	  # Create corpus directory
	  my $input = catdir("$temp", $corpus);

	  # Temporary directory
	  my $dir = catdir($input, $doc, $text);

	  # Write file - the target is the file name computed for
	  # this text, not the output directory itself
	  $msg = $batch_file->process($dir => $filename);

	  $temp = undef;
	  $pool->finish(0, \$msg);
	}
	else {

	  $temp = undef;
	  $msg = "Unable to extract " . $dirs[$i] . "\n";
	  $pool->finish(1, \$msg);
	};
	};
	}

	else {
	  print "Input is neither a directory nor an archive.\n\n";

	  # Nothing was processed and no start time was taken:
	  # clean up and signal the failure instead of reporting timings
	  unlink($cache_file) if $cache_delete;
	  exit(1);
	};

	$pool->wait_all_children;

	# Delete cache file
	unlink($cache_file) if $cache_delete;

	print "Done.\n";
	print timestr(timediff(Benchmark->new, $t))."\n\n";
	}

	# Unknown command
	else {
	warn "Unknown command '$cmd'.\n\n";
	pod2usage(%ERROR_HASH);
	}

	__END__

	=pod

	=encoding utf8

	=head1 NAME

	korapxml2krill - Merge KorapXML data and create Krill documents


	=head1 SYNOPSIS

	$ korapxml2krill -z --input <directory> --output <filename>
	$ korapxml2krill archive -z --input <directory> --output <directory>
	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>


	=head1 DESCRIPTION

	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
	compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
	The C<korapxml2krill> command line tool is a simple wrapper to the library.


	=head1 INSTALLATION

	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.

	$ cpanm https://github.com/KorAP/KorAP-XML-Krill

	In case everything went well, the C<korapxml2krill> tool will
	be available on your command line immediately.


	=head1 ARGUMENTS

	=over 2

	=item B<archive>

	Process an archive as a Zip-file or a folder of KorAP-XML documents.

	=item B<extract>

	Extract KorAP-XML files from a Zip-file.

	=back


	=head1 OPTIONS

	=over 2

	=item B<--input|-i> <directory|file|files>

	Directory or archive file of documents to convert.

	Archiving supports multiple input archives with the constraint,
	that the first archive listed contains all primary data files
	and all meta data files.

	-i file/news.zip -i file/news.malt.zip -i #file/news.tt.zip

	(The directory structure follows the base directory format,
	that may include a C<.> root folder.
	In this case further archives lacking a C<.> root folder
	need to be passed with a hash sign in front of the archive's name.)

	=item B<--output|-o> <directory|file>

	Output folder for archive processing or
	document name for single output (optional),
	writes to C<STDOUT> by default
	(in case C<output> is not mandatory due to further options).

	=item B<--overwrite|-w>

	Overwrite files that already exist.

	=item B<--token|-t> <foundry>[#<file>]

	Define the default tokenization by specifying
	the name of the foundry and optionally the name
	of the layer-file. Defaults to C<OpenNLP#tokens>.

	=item B<--skip|-s> <foundry>[#<layer>]

	Skip specific annotations by specifying the foundry
	(and optionally the layer with a C<#>-prefix),
	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
	Can be set multiple times.

	=item B<--anno|-a> <foundry>#<layer>

	Convert specific annotations by specifying the foundry
	(and optionally the layer with a C<#>-prefix),
	e.g. C<Mate> or C<Mate#Morpho>.
	Can be set multiple times.

	=item B<--primary|-p>

	Output primary data or not. Defaults to C<true>.
	Can be flagged using C<--no-primary> as well.
	This is I<deprecated>.

	=item B<--jobs|-j>

	Define the number of concurrent jobs in separate forks
	for archive processing.
	Defaults to C<0> (everything runs in a single process).
	This is I<experimental>.

	=item B<--meta|-m>

	Define the metadata parser to use. Defaults to C<I5>.
	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
	This is I<experimental>.

	=item B<--pretty|-y>

	Pretty print JSON output. Defaults to C<false>.
	This is I<deprecated>.

	=item B<--gzip|-z>

	Compress the output.
	Expects a defined C<output> file in single processing.

	=item B<--cache|-c>

	File to mmap a cache (using L<Cache::FastMmap>).
	Defaults to C<korapxml2krill.cache> in the calling directory.

	=item B<--cache-size|-cs>

	Size of the cache. Defaults to C<50m>.

	=item B<--cache-init|-ci>

	Initialize cache file.
	Can be flagged using C<--no-cache-init> as well.
	Defaults to C<true>.

	=item B<--cache-delete|-cd>

	Delete cache file after processing.
	Can be flagged using C<--no-cache-delete> as well.
	Defaults to C<true>.

	=item B<--sigle|-sg>

	Extract the given text sigles.
	Can be set multiple times.
	I<Currently only supported on C<extract>.>
	Sigles have the structure C<Corpus>/C<Document>/C<Text>.

	=item B<--log|-l>

	The L<Log4perl> log level, defaults to C<ERROR>.

	=item B<--help|-h>

	Print this document.

	=item B<--version|-v>

	Print version information.

	=back

	=head1 ANNOTATION SUPPORT

	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
	The base foundry with paragraphs, sentences, and the text element is mandatory for
	L<Krill|https://github.com/KorAP/Krill>.

	=over 2

	=item B<Base>

	=over 4

	=item #Paragraphs

	=item #Sentences

	=back

	=item B<Connexor>

	=over 4

	=item #Morpho

	=item #Phrase

	=item #Sentences

	=item #Syntax

	=back

	=item B<CoreNLP>

	=over 4

	=item #Constituency

	=item #Morpho

	=item #NamedEntities

	=item #Sentences

	=back

	=item B<DeReKo>

	=over 4

	=item #Structure

	=back

	=item B<Glemm>

	=over 4

	=item #Morpho

	=back

	=item B<Malt>

	=over 4

	=item #Dependency

	=back

	=item B<Mate>

	=over 4

	=item #Dependency

	=item #Morpho

	=back

	=item B<MDParser>

	=over 4

	=item #Dependency

	=back

	=item B<OpenNLP>

	=over 4

	=item #Morpho

	=item #Sentences

	=back

	=item B<Sgbr>

	=over 4

	=item #Lemma

	=item #Morpho

	=back

	=item B<TreeTagger>

	=over 4

	=item #Morpho

	=item #Sentences

	=back

	=item B<XIP>

	=over 4

	=item #Constituency

	=item #Dependency

	=item #Morpho

	=item #Sentences

	=back

	=back

	More importers are in preparation.
	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
	See the built-in annotation importers as examples.

	=head1 AVAILABILITY

	https://github.com/KorAP/KorAP-XML-Krill


	=head1 COPYRIGHT AND LICENSE

	Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>

	Author: L<Nils Diewald|http://nils-diewald.de/>

	L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
	Corpus Analysis Platform at the
	L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
	member of the
	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.

	This program is free software published under the
	L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.

	=cut