| #!/usr/bin/env perl | 
 | use strict; | 
 | use warnings; | 
 | use FindBin; | 
 | BEGIN { unshift @INC, "$FindBin::Bin/../lib" }; | 
 | use File::Spec::Functions qw/catfile catdir/; | 
 | use Getopt::Long qw/GetOptions :config no_auto_abbrev/; | 
 | use Benchmark qw/:hireswallclock/; | 
 | use IO::Compress::Gzip qw/$GzipError/; | 
 | use POSIX qw/ceil/; | 
 | use Log::Log4perl; | 
 | use Pod::Usage; | 
 | use Cache::FastMmap; | 
 | use Directory::Iterator; | 
 | use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!; | 
 | use KorAP::XML::Archive; | 
 | use KorAP::XML::Tokenizer; | 
 | use KorAP::XML::Batch::File; | 
 | use Config::Simple; | 
 | use Parallel::ForkManager; | 
 | use v5.10; | 
 | use Sys::Info; | 
 | use Sys::Info::Constants qw( :device_cpu ); | 
 | use File::Glob ':bsd_glob'; | 
 | use File::Temp qw/tempdir/; | 
 | use File::Path qw(remove_tree make_path); | 
 | use File::Basename; | 
 | use Mojo::Collection 'c'; | 
 | use String::Random qw(random_string); | 
 | use IO::File; | 
 | use Archive::Tar::Builder; | 
 | use Fcntl qw(:flock SEEK_END); | 
 |  | 
 | # use KorAP::XML::ForkPool; | 
 | # TODO: use Parallel::Loops | 
 | # TODO: make output files | 
 |  | 
 | # TODO: Use KorAP::XML::ForkPool! | 
 |  | 
 | # CHANGES: | 
 | # ---------------------------------------------------------- | 
 | # 2013/11/25 | 
 | # - Initial release | 
 | # | 
 | # 2014/10/29 | 
 | # - Merges foundry data to create indexer friendly documents | 
 | # | 
 | # 2016/02/04 | 
 | # - renamed to korapxml2krill | 
 | # - added Schreibgebrauch support | 
 | # | 
 | # 2016/02/12 | 
 | # - fixed foundry skipping | 
 | # - Support overwrite in archive processing | 
 | # | 
 | # 2016/02/14 | 
 | # - Added version information | 
 | # - Added support for archive files | 
 | # | 
 | # 2016/02/15 | 
 | # - Fixed temporary directory bug | 
 | # - Improved skipping before unzipping | 
 | # - Added EXPERIMENTAL concurrency support | 
 | # | 
 | # 2016/02/23 | 
 | # - Merge korapxml2krill and korapxml2krill_dir | 
 | # | 
 | # 2016/02/27 | 
 | # - Added extract function | 
 | # | 
 | # 2016/03/17 | 
 | # - Added meta switch | 
 | # | 
 | # 2016/03/18 | 
 | # - Added meta data caching | 
 | # | 
 | # 2016/06/27 | 
 | # - Added multi archive support | 
 | # - Added prefix negation support | 
 | # - Added Malt#Dependency support | 
 | # | 
 | # 2016/07/06 | 
 | # - Added MDParser#Dependency | 
 | # | 
 | # 2016/10/15 | 
 | # - Fixed temporary path issue in script | 
 | # | 
 | # 2016/10/24 | 
 | # - Improved Windows support | 
 | # | 
 | # 2016/10/24 | 
 | # - Added support for document extraction | 
 | # | 
 | # 2016/10/27 | 
 | # - Added wildcard support for document extraction | 
 | # | 
 | # 2016/12/21 | 
 | # - added support for base-sentences and base-tokenizations | 
 | # | 
 | # 2017/01/20 | 
 | # - added support for DRuKoLa annotations | 
 | # | 
 | # 2017/02/08 | 
 | # - added support for pagebreak annotations | 
 | # | 
 | # 2017/04/06 | 
 | # - added support for wildcards in input | 
 | # | 
 | # 2017/04/07 | 
 | # - support configuration option | 
 | # - support for temporary extraction | 
 | # | 
 | # 2017/04/12 | 
 | # - support serial processing | 
 | # - support input root | 
 | # - introduced --sequential-extraction flag | 
 | # | 
 | # 2017/06/19 | 
 | # - added support for DCK | 
 | # | 
 | # 2017/06/29 | 
 | # - Fixed exit codes | 
 | # | 
 | # 2017/07/04 | 
 | # - Fixed tar building process | 
 | # | 
 | # 2018/01/16 | 
 | # - Added LWC support | 
 | # | 
 | # 2018/07/19 | 
 | # - Preliminary support for HNC. | 
 | # | 
 | # 2019/01/22 | 
 | # - Preliminary support for DGD. | 
 | # - Support for non-word tokens. | 
 | # | 
 | # 2019/02/13 | 
 | # - Support for 'koral:field' array. | 
 | # - Support for Koral versioning. | 
 | # - Ignore temporary extract parameter on | 
 | #   directory archiving. | 
 | # | 
 | # 2019/08/08 | 
 | # - Support for Talismane. | 
 | # | 
 | # 2019/12/17 | 
 | # - Added support for DGD pseudo-sentences | 
 | #   based on anchor milestones. | 
 | # - Support for non-verbal annotations. | 
 | # | 
 | # 2020/04/23 | 
 | # - Added support for Redewiedergabe-Korpus structure | 
 | #   annotations, based on sentence and paragraph milestones | 
 | # - Added support for Redewiedergabe-Korpus morphology | 
 | # ---------------------------------------------------------- | 
 |  | 
 | our $LAST_CHANGE = '2020/04/23'; | 
 | our $LOCAL = $FindBin::Bin; | 
 | our $KORAL_VERSION = 0.03; | 
 | our $VERSION_MSG = <<"VERSION"; | 
 | Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE | 
 | VERSION | 
 |  | 
 | # Prototypes | 
 | sub get_file_name($$); | 
 |  | 
# Parse command
 | my $cmd; | 
 | our @ARGV; | 
 | if ($ARGV[0] && index($ARGV[0], '-') != 0) { | 
 |   $cmd = shift @ARGV; | 
 | }; | 
 | my @keep_argv = @ARGV; | 
 |  | 
 | my (@skip, @sigle, @anno, @input); | 
 | my $text; | 
 |  | 
 | # Parse options from the command line | 
 | GetOptions( | 
 |   'input|i=s'   => \@input, | 
 |   'input-base|ib=s' => \(my $input_base), | 
 |   'output|o=s'  => \(my $output), | 
 |   'overwrite|w' => \(my $overwrite), | 
 |   'meta|m=s'    => \(my $meta), | 
 |   'token|t=s'   => \(my $token_base), | 
 |   'base-sentences|bs=s'   => \(my $base_sentences), | 
 |   'base-paragraphs|bp=s'  => \(my $base_paragraphs), | 
 |   'base-pagebreaks|bpb=s' => \(my $base_pagebreaks), | 
 |   'gzip|z'      => \(my $gzip), | 
 |   'temporary-extract|te=s' => \(my $extract_dir), | 
 |   'skip|s=s'    => \@skip, | 
 |   'sigle|sg=s'  => \@sigle, | 
 |   'cache|c=s'   => \(my $cache_file), | 
 |   'config|cfg=s' => \(my $cfg_file), | 
 |   'log|l=s'     => \(my $log_level), | 
 |   'anno|a=s'    => \@anno, | 
 |   'primary|p!'  => \(my $primary), | 
 |   'pretty|y'    => \(my $pretty), | 
 |   'jobs|j=i'    => \(my $jobs), | 
 |   'koral|k=f'    => \(my $koral), | 
 |   'to-tar'      => \(my $to_tar), | 
 |   'non-word-tokens|nwt' => \(my $non_word_tokens), | 
 |   'non-verbal-tokens|nvt' => \(my $non_verbal_tokens), | 
 |   'sequential-extraction|se' => \(my $sequential_extraction), | 
 |   'cache-size|cs=s'  => \(my $cache_size), | 
 |   'cache-delete|cd!' => \(my $cache_delete), | 
 |   'cache-init|ci!'   => \(my $cache_init), | 
 |   'help|h'      => sub { | 
 |     pod2usage( | 
 |       -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS', | 
 |       -verbose  => 99, | 
 |       -msg      => $VERSION_MSG, | 
 |       -output   => '-' | 
 |     ); | 
 |   }, | 
 |   'version|v'   => sub { | 
 |     pod2usage( | 
 |       -verbose  => 0, | 
 |       -msg      => $VERSION_MSG, | 
 |       -output   => '-' | 
 |     ) | 
 |   } | 
 | ); | 
 |  | 
 |  | 
 | # Load from configuration | 
 | if ($cfg_file && -e $cfg_file) { | 
 |   my %config; | 
 |  | 
 |   Config::Simple->import_from($cfg_file, \%config); | 
 |  | 
 |   # Overwrite | 
 |   if (!defined($overwrite) && defined $config{overwrite}) { | 
 |     $overwrite = $config{overwrite}; | 
 |   }; | 
 |  | 
 |   # Gzip | 
 |   if (!defined($gzip) && defined $config{gzip}) { | 
 |     $gzip = $config{gzip}; | 
 |   }; | 
 |  | 
 |   # Jobs | 
 |   if (!defined($jobs) && defined $config{jobs}) { | 
 |     $jobs = $config{jobs}; | 
 |   }; | 
 |  | 
 |   # Koral version | 
 |   if (!defined($koral) && defined $config{koral}) { | 
 |     $koral = $config{koral}; | 
 |   }; | 
 |  | 
 |   # Input root base directory | 
 |   if (!defined($input_base) && defined $config{'input-base'}) { | 
 |     $input_base = $config{'input-base'}; | 
 |   }; | 
 |  | 
 |   # temporary-extract | 
 |   if (!defined($extract_dir) && defined $config{'temporary-extract'}) { | 
 |     $extract_dir = $config{'temporary-extract'}; | 
 |   }; | 
 |  | 
 |   # Token base | 
 |   if (!defined($token_base) && defined $config{token}) { | 
 |     $token_base = $config{token}; | 
 |   }; | 
 |  | 
 |   # Non-word tokenization | 
 |   if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) { | 
 |     $non_word_tokens = $config{'non-word-tokens'}; | 
 |   }; | 
 |  | 
 |   # Non-verbal tokenization | 
 |   if (!defined($non_verbal_tokens) && defined $config{'non-verbal-tokens'}) { | 
 |     $non_verbal_tokens = $config{'non-verbal-tokens'}; | 
 |   }; | 
 |  | 
 |   # Cache file | 
 |   if (!defined($cache_file) && defined $config{cache}) { | 
 |     $cache_file = $config{cache}; | 
 |   }; | 
 |  | 
 |   # Cache size | 
 |   if (!defined($cache_size) && defined $config{'cache-size'}) { | 
 |     $cache_size = $config{'cache-size'}; | 
 |   }; | 
 |  | 
 |   # Cache delete | 
 |   if (!defined($cache_delete) && defined $config{'cache-delete'}) { | 
 |     $cache_delete = $config{'cache-delete'} ; | 
 |   }; | 
 |  | 
 |   # Cache init | 
 |   if (!(defined $cache_init) && defined $config{'cache-init'}) { | 
 |     $cache_init = $config{'cache-init'} ; | 
 |   }; | 
 |  | 
 |   # Jobs for extraction | 
 |   if (!(defined $sequential_extraction) && defined $config{'sequential-extraction'}) { | 
 |     $sequential_extraction = $config{'sequential-extraction'} ; | 
 |   }; | 
 |  | 
 |   # Meta | 
 |   if (!(defined $meta) && defined $config{'meta'}) { | 
 |     $meta = $config{'meta'} ; | 
 |   }; | 
 |  | 
 |   # Output | 
 |   if (!(defined $output) && defined $config{'output'}) { | 
 |     $output = $config{'output'} ; | 
 |   }; | 
 |  | 
 |   # Base-sentences | 
 |   if (!(defined $base_sentences) && defined $config{'base-sentences'}) { | 
 |     $base_sentences = $config{'base-sentences'} ; | 
 |   }; | 
 |  | 
 |   # Base-paragraphs | 
 |   if (!(defined $base_paragraphs) && defined $config{'base-paragraphs'}) { | 
 |     $base_paragraphs = $config{'base-paragraphs'} ; | 
 |   }; | 
 |  | 
 |   # Base-pagebreaks | 
 |   if (!(defined $base_pagebreaks) && defined $config{'base-pagebreaks'}) { | 
 |     $base_pagebreaks = $config{'base-pagebreaks'} ; | 
 |   }; | 
 |  | 
 |   # Write to tar | 
 |   if (!(defined $to_tar) && defined $config{'to-tar'}) { | 
 |     $to_tar = $config{'to-tar'} ; | 
 |   }; | 
 |  | 
 |   # Log | 
 |   if (!(defined $log_level) && defined $config{'log'}) { | 
 |     $log_level = $config{'log'} ; | 
 |   }; | 
 |  | 
 |   # Skip | 
 |   if (!scalar(@skip) && defined $config{'skip'}) { | 
 |     @skip = split /\s*;\s*/, $config{'skip'} ; | 
 |   }; | 
 |  | 
 |   # Sigle | 
 |   if (!scalar(@sigle) && defined $config{'sigle'}) { | 
 |     @sigle = split /\s*;\s*/, $config{'sigle'} ; | 
 |   }; | 
 |  | 
 |   # Anno | 
 |   if (!scalar(@anno) && defined $config{'anno'}) { | 
 |     @anno = split /\s*;\s*/, $config{'anno'} ; | 
 |   }; | 
 | }; | 
 |  | 
 |  | 
 | # Set default token base | 
 | $token_base          //= 'OpenNLP#tokens'; | 
 | $cache_file          //= 'korapxml2krill.cache'; | 
 | $cache_size          //= '50m'; | 
 | $jobs                //= 0; | 
 | $koral               //= $KORAL_VERSION; | 
 | $cache_delete        //= 1; | 
 | $cache_init          //= 1; | 
 | $sequential_extraction //= 0; | 
 | $log_level           //= 'ERROR'; | 
 | $base_sentences      //= ''; | 
 | $base_paragraphs     //= ''; | 
 | $base_pagebreaks     //= ''; | 
 | $non_word_tokens     //= 0; | 
 | $non_verbal_tokens   //= 0; | 
 |  | 
 | $base_sentences  = lc $base_sentences; | 
 | $base_paragraphs = lc $base_paragraphs; | 
 | $base_pagebreaks = lc $base_pagebreaks; | 
 |  | 
 |  | 
 | # Initialize log4perl object | 
 | Log::Log4perl->init({ | 
 |   'log4perl.rootLogger' => uc($log_level) . ', STDERR', | 
 |   'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels', | 
 |   'log4perl.appender.STDERR.layout' => 'PatternLayout', | 
 |   'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n' | 
 | }); | 
 |  | 
 | my $log = Log::Log4perl->get_logger('main'); | 
 |  | 
 |  | 
 | print "Reading config from $cfg_file\n" if $cfg_file; | 
 |  | 
 |  | 
 | my %ERROR_HASH = ( | 
 |   -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS', | 
 |   -verbose  => 99, | 
 |   -msg      => $VERSION_MSG, | 
 |   -output   => '-', | 
 |   -exit     => 1 | 
 | ); | 
 |  | 
 | # Input has to be defined | 
 | pod2usage(%ERROR_HASH) unless @input; | 
 |  | 
 | # Gzip has no effect, if no output is given | 
 | pod2usage(%ERROR_HASH) if $gzip && !$output; | 
 |  | 
 |  | 
if ($jobs == -1) {
 |   state $cores = Sys::Info->new->device('CPU')->count; | 
 |   $jobs = ceil(5 * $cores); | 
 |   $log->info("Run using $jobs jobs on $cores cores"); | 
 | }; | 
 |  | 
 |  | 
 | # Start serial processing | 
 | if ($cmd && $cmd eq 'serial') { | 
 |  | 
 |   if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) { | 
 |     $log->error("Directory '$output' does not exist."); | 
 |     exit 1; | 
 |   }; | 
 |  | 
 |   # Remove all inputs | 
 |   my $remove_next = 0; | 
 |   @keep_argv = @{c(@keep_argv)->grep( | 
 |     sub { | 
 |       # Input flag | 
 |       if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') { | 
 |         $remove_next = 1; | 
 |         return 0; | 
 |       } | 
 |  | 
 |       # input value | 
 |       elsif ($remove_next) { | 
 |         $remove_next = 0; | 
 |         return 0; | 
 |       }; | 
 |  | 
 |       # Pass parameter | 
 |       return 1; | 
 |     } | 
 |   )->to_array}; | 
 |  | 
 |  | 
 |   # Iterate over all inputs | 
 |   foreach (@input) { | 
 |  | 
 |     # This will create a directory | 
 |     my $new_out = catdir($output, get_file_name_from_glob($_)); | 
 |  | 
 |     # Create new path, in case the output is not meant to be tarred | 
 |     unless ($to_tar) { | 
 |       if (make_path($new_out) == 0 && !-d $new_out) { | 
 |         $log->error("Can\'t create path $new_out"); | 
 |         exit 1; | 
 |       }; | 
 |     }; | 
 |  | 
 |     # Create archive command | 
 |     my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out); | 
 |     print "Start serial processing of $_ to $new_out\n"; | 
 |  | 
 |     # Start archiving | 
 |     system @archive_cmd; | 
 |   }; | 
 |  | 
 |   exit; | 
 | }; | 
 |  | 
 | my %skip; | 
 | $skip{lc($_)} = 1 foreach @skip; | 
 |  | 
 | my @layers; | 
 | push(@layers, ['Base', 'Sentences']) unless $base_sentences; | 
 | push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs; | 
 |  | 
 | # Connexor | 
 | push(@layers, ['Connexor', 'Morpho']); | 
 | push(@layers, ['Connexor', 'Syntax']); | 
 | push(@layers, ['Connexor', 'Phrase']); | 
 | push(@layers, ['Connexor', 'Sentences']); | 
 |  | 
 | # CoreNLP | 
 | push(@layers, ['CoreNLP', 'NamedEntities']); | 
 | push(@layers, ['CoreNLP', 'Sentences']); | 
 | push(@layers, ['CoreNLP', 'Morpho']); | 
 | push(@layers, ['CoreNLP', 'Constituency']); | 
 |  | 
 | # CMC | 
 | push(@layers, ['CMC', 'Morpho']); | 
 |  | 
 | # DeReKo | 
 | my @dereko_attr = (); | 
 | if ($base_sentences eq 'dereko#structure') { | 
 |   push @dereko_attr, 'sentences'; | 
 | }; | 
 | if ($base_paragraphs eq 'dereko#structure') { | 
 |   push @dereko_attr, 'paragraphs'; | 
 | }; | 
 |  | 
 | if ($base_pagebreaks eq 'dereko#structure') { | 
 |   push @dereko_attr, 'pagebreaks'; | 
 | }; | 
 |  | 
if (@dereko_attr) {
 |   push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]); | 
 | } | 
 | else { | 
 |   push(@layers, ['DeReKo', 'Structure']); | 
 | }; | 
 |  | 
 | # DGD | 
 | push(@layers, ['DGD', 'Morpho']); | 
 | if ($base_sentences eq 'dgd#structure') { | 
 |   push(@layers, ['DGD', 'Structure', 'base-sentence']); | 
 | } | 
 |  | 
 | # DRuKoLa | 
 | push(@layers, ['DRuKoLa', 'Morpho']); | 
 |  | 
 | # Glemm | 
 | push(@layers, ['Glemm', 'Morpho']); | 
 |  | 
 | # HNC | 
 | push(@layers, ['HNC', 'Morpho']); | 
 |  | 
 | # LWC | 
 | push(@layers, ['LWC', 'Dependency']); | 
 |  | 
 | # Malt | 
 | push(@layers, ['Malt', 'Dependency']); | 
 |  | 
 | # Marmot | 
 | push(@layers, ['MarMoT', 'Morpho']); | 
 |  | 
 | # Mate | 
 | push(@layers, ['Mate', 'Morpho']); | 
 | push(@layers, ['Mate', 'Dependency']); | 
 |  | 
 | # MDParser | 
 | push(@layers, ['MDParser', 'Dependency']); | 
 |  | 
 | # OpenNLP | 
 | push(@layers, ['OpenNLP', 'Morpho']); | 
 | push(@layers, ['OpenNLP', 'Sentences']); | 
 |  | 
 | # Redewiedergabe | 
 | push(@layers, ['RWK', 'Morpho']); | 
 | if ($base_sentences eq 'rwk#structure') { | 
 |   push(@layers, ['RWK', 'Structure']); | 
 | }; | 
 |  | 
 | # Schreibgebrauch | 
 | push(@layers, ['Sgbr', 'Lemma']); | 
 | push(@layers, ['Sgbr', 'Morpho']); | 
 |  | 
 | # Talismane | 
 | push(@layers, ['Talismane', 'Dependency']); | 
 | push(@layers, ['Talismane', 'Morpho']); | 
 |  | 
 | # TreeTagger | 
 | push(@layers, ['TreeTagger', 'Morpho']); | 
 | push(@layers, ['TreeTagger', 'Sentences']); | 
 |  | 
 | # XIP | 
 | push(@layers, ['XIP', 'Morpho']); | 
 | push(@layers, ['XIP', 'Constituency']); | 
 | push(@layers, ['XIP', 'Sentences']); | 
 | push(@layers, ['XIP', 'Dependency']); | 
 |  | 
 |  | 
 | # Check filters | 
 | my @filtered_anno; | 
 | if ($skip{'#all'}) { | 
 |   foreach (@anno) { | 
 |     push @filtered_anno, [ split('#', $_) ]; | 
 |   }; | 
 | } | 
 |  | 
 | # Add all annotations that are not skipped | 
 | else { | 
 |   # Add to index file - respect skipping | 
 |   foreach my $info (@layers) { | 
 |     # Skip if Foundry or Foundry#Layer should be skipped | 
 |     unless ($skip{lc($info->[0])} || $skip{lc($info->[0]) . '#' . lc($info->[1])}) { | 
 |       push @filtered_anno, $info; | 
 |     }; | 
 |   }; | 
 | }; | 
 |  | 
 | # Get tokenization basis | 
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension
$token_base_layer =~ s/\.xml$//i if $token_base_layer;
 |  | 
 | # TODO: This should not be initialized for batch | 
 | my $cache = Cache::FastMmap->new( | 
 |   share_file => $cache_file, | 
 |   cache_size => $cache_size, | 
 |   init_file  => $cache_init | 
 | ); | 
 |  | 
 | # Create batch object | 
 | my $batch_file = KorAP::XML::Batch::File->new( | 
 |   cache     => $cache, | 
 |   meta_type => $meta, | 
 |   overwrite => $overwrite, | 
 |   foundry   => $token_base_foundry, | 
 |   layer     => $token_base_layer, | 
 |   gzip      => $gzip, | 
 |   log       => $log, | 
 |   koral     => $koral, | 
 |   primary   => $primary, | 
 |   pretty    => $pretty, | 
 |   anno      => \@filtered_anno, | 
 |   non_word_tokens => $non_word_tokens, | 
 |   non_verbal_tokens => $non_verbal_tokens | 
 | ); | 
 |  | 
 | # Convert sigle to path construct | 
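# (e.g. 'WPD17_060.18486' becomes 'WPD17/060/18486')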
 | s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle; | 
 |  | 
 | if ($cmd) { | 
 |   if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) { | 
 |     $log->error("Directory '$output' does not exist."); | 
 |     exit 1; | 
 |   }; | 
 | }; | 
 |  | 
 |  | 
 | # Glob and prefix files | 
 | if (@input) { | 
 |  | 
 |   my @new_input = (); | 
 |  | 
 |   # Iterate over all inputs | 
 |   foreach my $wild_card (@input) { | 
 |  | 
 |     # Prefix with input root | 
 |     $wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card; | 
 |  | 
 |     push (@new_input, bsd_glob($wild_card)); | 
 |   }; | 
 |  | 
 |   # Sort files by length | 
 |   @input = sort { length($a) <=> length($b) } @new_input; | 
 |  | 
 |   print 'Input is ' . join(', ', @input)."\n"; | 
 | }; | 
 |  | 
 |  | 
 | # Process a single file | 
 | unless ($cmd) { | 
 |   my $input = $input[0]; | 
 |  | 
 |   BEGIN { | 
 |     $main::TIME = Benchmark->new; | 
 |     $main::LAST_STOP = Benchmark->new; | 
 |   }; | 
 |  | 
 |   sub stop_time { | 
 |     my $new = Benchmark->new; | 
 |     $log->info( | 
 |       'The code took: '. | 
 |         timestr(timediff($new, $main::LAST_STOP)) . | 
 |         ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')' | 
 |       ); | 
 |     $main::LAST_STOP = $new; | 
 |   }; | 
 |  | 
 |   # Create and parse new document | 
 |   $input =~ s{([^/])$}{$1/}; | 
 |  | 
 |   # Process file | 
 |   $batch_file->process($input, $output); | 
 |  | 
 |   # Delete cache file | 
 |   unlink($cache_file) if $cache_delete; | 
 |  | 
 |   stop_time; | 
 |   exit; | 
 | }; | 
 |  | 
 |  | 
 | # Extract XML files | 
 | if ($cmd eq 'extract') { | 
 |  | 
 |   # Output is required | 
 |   pod2usage(%ERROR_HASH) unless $output; | 
 |  | 
 |   # Create new archive object | 
 |   if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) { | 
 |  | 
 |     # Check zip capabilities | 
 |     unless ($archive->test_unzip) { | 
 |       $log->error("Unzip is not installed or incompatible."); | 
 |       exit 1; | 
 |     }; | 
 |  | 
# Add further annotation archives
 |     $archive->attach($_) foreach @input[1..$#input]; | 
 |  | 
 |     # Will set @sigle | 
 |     my $prefix = set_sigle($archive); | 
 |  | 
 |     # Iterate over all given sigles and extract | 
 |     foreach (@sigle) { | 
 |  | 
 |       print "$_ ...\n"; | 
 |  | 
 |       # TODO: Make this OS independent | 
 |       print '... ' . ( | 
 |  | 
 |         # TODO: | 
 |         #   - prefix??? | 
 |         $archive->extract_sigle([$_], $output, $jobs) | 
 |         ? '' : 'not ' | 
 |       ); | 
 |       print "extracted.\n"; | 
 |     }; | 
 |   } | 
 |  | 
 |   # Can't create archive object | 
 |   else { | 
 |     $log->error('Unable to extract from primary archive ' . $input[0]); | 
 |     exit 1; | 
 |   }; | 
 | } | 
 |  | 
 |  | 
 | # Process an archive | 
 | elsif ($cmd eq 'archive') { | 
 |  | 
 |   my $archive_output; | 
 |  | 
 |   # First extract, then archive | 
 |   if (defined $extract_dir && !-d $input[0]) { | 
 |  | 
 |     # Create new archive object | 
 |     if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) { | 
 |  | 
 |       # Check zip capabilities | 
 |       unless ($archive->test_unzip) { | 
 |         $log->error("Unzip is not installed or incompatible."); | 
 |         exit 1; | 
 |       }; | 
 |  | 
# Add further annotation archives
 |       $archive->attach($_) foreach @input[1..$#input]; | 
 |  | 
 |       # Create a temporary directory | 
 |       if ($extract_dir eq ':temp:') { | 
 |         $extract_dir = tempdir(CLEANUP => 0); | 
 |         print "Temporarily extract to $extract_dir\n"; | 
 |       }; | 
 |  | 
 |       # Add some random extra to avoid clashes with multiple archives | 
 |       $extract_dir = catdir($extract_dir, random_string('cccccc')); | 
 |  | 
 |       # Extract to temporary directory | 
 |       if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) { | 
 |         @input = ($extract_dir); | 
 |       } | 
 |       else { | 
 |         $log->error('Unable to extract from primary archive ' . $input[0] . | 
 |                       ' to ' . $extract_dir); | 
 |         exit 1; | 
 |       }; | 
 |     } | 
 |  | 
 |     # Can't create archive object | 
 |     else { | 
 |       $log->error('Unable to extract from primary archive ' . $input[0]); | 
 |       exit 1; | 
 |     }; | 
 |   }; | 
 |  | 
 |   # Zero means: everything runs in the parent process | 
 |   my $pool = Parallel::ForkManager->new($jobs); | 
 |  | 
 |   my $count = 0;  # Texts to process | 
 |   my $iter  = 1;  # Current text in process | 
 |  | 
 |   my $tar_archive; | 
 |   my $output_dir = $output; | 
 |   my $tar_fh; | 
 |  | 
 |   # Initialize tar archive | 
 |   if ($to_tar) { | 
 |     $tar_archive = Archive::Tar::Builder->new( | 
 |       ignore_errors => 1 | 
 |     ); | 
 |  | 
 |     # Set output name | 
 |     my $tar_file = $output; | 
 |     unless ($tar_file =~ /\.tar$/) { | 
 |       $tar_file .= '.tar'; | 
 |     }; | 
 |  | 
 |     # Initiate the tar file | 
 |     print "Writing to file $tar_file\n"; | 
 |     $tar_fh = IO::File->new($tar_file, 'w'); | 
 |     $tar_fh->binmode(1); | 
 |  | 
 |     # Set handle | 
 |     $tar_archive->set_handle($tar_fh); | 
 |  | 
 |     # Output to temporary directory | 
 |     $output_dir = File::Temp->newdir; | 
 |   }; | 
 |  | 
 |   # Report on fork message | 
 |   $pool->run_on_finish ( | 
 |     sub { | 
 |       my ($pid, $code) = @_; | 
 |       my $data = pop; | 
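# $data is [message, temporary directory (if any), output file name]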
 |  | 
 |       print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') . | 
 |         ($iter++) . "/$count]" . | 
 |         ($code ? " $code" : '') . | 
 |         ' ' . $data->[0] . "\n"; | 
 |  | 
 |       if (!$code && $to_tar && $data->[2]) { | 
 |         my $filename = $data->[2]; | 
 |  | 
 |         # Lock filehandle | 
 |         if (flock($tar_fh, LOCK_EX)) { | 
 |  | 
 |           my $clean_file = fileparse($filename); | 
 |  | 
 |           # Archive and remove file | 
 |           $tar_archive->archive_as($filename => $clean_file); | 
 |           unlink $filename; | 
 |  | 
 |           # Unlock filehandle | 
 |           flock($tar_fh, LOCK_UN); | 
 |         } | 
 |         else { | 
 |           $log->warn("Unable to add $filename to archive"); | 
 |         }; | 
 |       }; | 
 |  | 
 |       $data->[1] = undef if $data->[1]; | 
 |     } | 
 |   ); | 
 |  | 
 |   my $t; | 
 |   my $temp; | 
 |   print "Reading data ...\n"; | 
 |  | 
 |   #  unless (Cache::FastMmap->new( | 
 |   #    share_file => $cache_file, | 
 |   #    cache_size => $cache_size, | 
 |   #    init_file => $cache_init | 
 |   #  )) { | 
 |   #    print "Unable to intialize cache '$cache_file'\n\n"; | 
 |   #    exit(1); | 
 |   #  }; | 
 |  | 
 |  | 
 |   # Input is a directory | 
 |   if (-d $input[0]) { | 
 |     my $it = Directory::Iterator->new($input[0]); | 
 |     my @dirs; | 
 |     my $dir; | 
 |  | 
# TODO: Make this a do-while loop
 |     while (1) { | 
 |       if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) { | 
 |         push @dirs, $dir; | 
 |         $it->prune; | 
 |       }; | 
 |       last unless $it->next; | 
 |     }; | 
 |  | 
 |     print "Start processing ...\n"; | 
 |     $t = Benchmark->new; | 
 |     $count = scalar @dirs; | 
 |  | 
 |   DIRECTORY_LOOP: | 
 |     for (my $i = 0; $i < $count; $i++) { | 
 |  | 
 |       my $filename = catfile( | 
 |         $output_dir, | 
 |         get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '') | 
 |       ); | 
 |  | 
 |       # Get the next fork | 
 |       $pool->start and next DIRECTORY_LOOP; | 
 |  | 
 |       if (my $return = $batch_file->process($dirs[$i] => $filename)) { | 
 |         $pool->finish( | 
 |           0, | 
 |           [ | 
 |             "Processed " . $filename . ($return == -1 ? " - already existing" : ''), | 
 |             undef, | 
 |             $filename | 
 |           ] | 
 |         ); | 
 |       } | 
 |       else { | 
 |         $pool->finish(1, ["Unable to process " . $dirs[$i]]); | 
 |       }; | 
 |     }; | 
 |   } | 
 |  | 
 |   # Input is a file | 
 |   elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) { | 
 |  | 
 |     unless ($archive->test_unzip) { | 
 |       $log->error("Unzip is not installed or incompatible."); | 
 |       exit 1; | 
 |     }; | 
 |  | 
# Add further annotation archives
 |     $archive->attach($_) foreach @input[1..$#input]; | 
 |  | 
 |     # Get sigles to extract | 
 |     my $prefix = set_sigle($archive); | 
 |  | 
 |     print "Start processing ...\n"; | 
 |     $t = Benchmark->new; | 
 |     my @dirs = $archive->list_texts; | 
 |     $count = scalar @dirs; | 
 |  | 
 |   ARCHIVE_LOOP: | 
 |     for (my $i = 0; $i < $count; $i++) { | 
 |  | 
 |       # Split path information | 
 |       my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]); | 
 |  | 
 |       my $filename = catfile( | 
 |         $output_dir, | 
 |         get_file_name( | 
 |           $input[0], | 
 |           catfile($corpus, $doc, $text) | 
 |             . '.json' . ($gzip ? '.gz' : '') | 
 |           ) | 
 |       ); | 
 |  | 
 |       # Get the next fork | 
 |       $pool->start and next ARCHIVE_LOOP; | 
 |  | 
# Create temporary directory
 |       $temp = File::Temp->newdir; | 
 |  | 
 |       # TODO: Check if $filename exist at the beginning, | 
 |       # because extraction can be horrible slow! | 
 |  | 
 |       # Extract from archive | 
 |       if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) { | 
 |  | 
# Corpus path in the temporary directory
 |         my $input = catdir("$temp", $corpus); | 
 |  | 
# Text path in the temporary directory
 |         my $dir = catdir($input, $doc, $text); | 
 |  | 
 |         # Write file | 
 |         if (my $return = $batch_file->process($dir => $filename)) { | 
 |  | 
# Delete temporary directory
 |           $pool->finish( | 
 |             0, | 
 |             [ | 
 |               "Processed " . $filename . ($return == -1 ? " - already existing" : ''), | 
 |               $temp, | 
 |               $filename | 
 |             ] | 
 |           ); | 
 |           #$pool->finish(0, ["Processed " . $filename, $temp]); | 
 |         } | 
 |         else { | 
# Delete temporary directory
 |           $pool->finish(1, ["Unable to process " . $dir, $temp]); | 
 |         }; | 
 |       } | 
 |  | 
 |       # Unable to extract | 
 |       else { | 
 |         $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]); | 
 |       }; | 
 |     }; | 
 |   } | 
 |  | 
 |   else { | 
 |     print "Input is neither a directory nor an archive.\n\n"; | 
 |   }; | 
 |  | 
 |   $pool->wait_all_children; | 
 |  | 
 |   # Delete cache file | 
 |   unlink($cache_file) if $cache_delete; | 
 |  | 
 |   # Close tar filehandle | 
 |   if ($to_tar && $tar_fh) { | 
 |     $tar_archive->finish; | 
 |     $tar_fh->close; | 
 |     print "Wrote to tar archive.\n"; | 
 |   }; | 
 |  | 
 |   print timestr(timediff(Benchmark->new, $t))."\n"; | 
 |   print "Done.\n"; | 
 | }; | 
 |  | 
 |  | 
 | # For an archive, this will create the list | 
 | # of all sigles to process | 
 | sub set_sigle { | 
 |   my $archive = shift; | 
 |  | 
 |   my $prefix = 1; | 
 |   my @dirs = (); | 
 |  | 
 |   # No sigles given | 
 |   unless (@sigle) { | 
 |  | 
 |     # Get files | 
 |     foreach ($archive->list_texts) { | 
 |  | 
 |       push @dirs, $_; | 
 |  | 
 |       # Split path information | 
 |       ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_); | 
 |  | 
 |       # TODO: Make this OS independent | 
 |       push @sigle, join '/', $corpus, $doc, $text; | 
 |     }; | 
 |   } | 
 |  | 
 |   # Check sigle for doc sigles | 
 |   else { | 
 |     my @new_sigle; | 
 |  | 
 |     my $prefix_check = 0; | 
 |  | 
 |     # Iterate over all sigle | 
 |     foreach (@sigle) { | 
 |  | 
 |       # Sigle is a doc sigle | 
 |       if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) { | 
 |  | 
 |         print "$_ ..."; | 
 |         # Check if a prefix is needed | 
 |         unless ($prefix_check) { | 
 |  | 
 |           if ($prefix = $archive->check_prefix) { | 
 |             print " with prefix ..."; | 
 |           }; | 
 |           $prefix_check = 1; | 
 |         }; | 
 |  | 
 |         print "\n"; | 
 |  | 
 |         print '... ' . ( | 
 |           $archive->extract_sigle([$_], $output, $sequential_extraction ? 1 : $jobs) | 
 |             ? '' : 'not ' | 
 |         ); | 
 |         print "extracted.\n"; | 
 |       } | 
 |  | 
 |       # Sigle is a text sigle | 
 |       else { | 
 |         push @new_sigle, $_; | 
 |  | 
 |         unless ($prefix_check) { | 
 |  | 
 |           if ($prefix = $archive->check_prefix) { | 
 |             print " with prefix ..."; | 
 |           }; | 
 |           $prefix_check = 1; | 
 |         }; | 
 |       }; | 
 |     }; | 
 |     @sigle = @new_sigle; | 
 |   }; | 
 |  | 
 |   return $prefix; | 
 | }; | 
 |  | 
 |  | 
 |  | 
 | # Cleanup temporary extraction directory | 
 | if ($extract_dir) { | 
 |   my $objects = remove_tree($extract_dir, { safe => 1 }); | 
 |   print "Removed directory $extract_dir with $objects objects.\n"; | 
 | }; | 
 |  | 
 |  | 
 | print "\n"; | 
 |  | 
 | __END__ | 
 |  | 
 | =pod | 
 |  | 
 | =encoding utf8 | 
 |  | 
 | =head1 NAME | 
 |  | 
 | korapxml2krill - Merge KorAP-XML data and create Krill documents | 
 |  | 
 |  | 
 | =head1 SYNOPSIS | 
 |  | 
 |   korapxml2krill [archive|extract] --input <directory|archive> [options] | 
 |  | 
 |  | 
 | =head1 DESCRIPTION | 
 |  | 
 | L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files | 
 | compatible with the L<Krill|https://github.com/KorAP/Krill> indexer. | 
The C<korapxml2krill> command line tool is a simple wrapper around this library.
 |  | 
 |  | 
 | =head1 INSTALLATION | 
 |  | 
 | The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>. | 
 |  | 
 |   $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git | 
 |  | 
If everything went well, the C<korapxml2krill> tool will
be available on your command line immediately.
The minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
 |  | 
 | =head1 ARGUMENTS | 
 |  | 
 |   $ korapxml2krill -z --input <directory> --output <filename> | 
 |  | 
 | Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document. | 
 | It expects the input to point to the text level folder. | 
 |  | 
 | =over 2 | 
 |  | 
 | =item B<archive> | 
 |  | 
 |   $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar> | 
 |  | 
 | Converts an archive of KorAP-XML documents. It expects a directory | 
 | (pointing to the corpus level folder) or one or more zip files as input. | 
 |  | 
 | =item B<extract> | 
 |  | 
 |   $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE> | 
 |  | 
 | Extracts KorAP-XML documents from a zip file. | 
 |  | 
 | =item B<serial> | 
 |  | 
 |   $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file> | 
 |  | 
Converts archives sequentially. The inputs are not merged but treated
as they are (so they may be pre-merged archives or globs).
The C<--output> directory is treated as the base directory, in which
subdirectories are created based on the archive name. In case the
C<--to-tar> flag is given, the output will be a tar file.
 |  | 
 |  | 
 | =back | 
 |  | 
 |  | 
 | =head1 OPTIONS | 
 |  | 
 | =over 2 | 
 |  | 
 | =item B<--input|-i> <directory|zip file> | 
 |  | 
 | Directory or zip file(s) of documents to convert. | 
 |  | 
 | Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML | 
 | document, while C<archive> expects a KorAP-XML corpus folder or a zip | 
 | file to batch process multiple files. | 
 | C<extract> expects zip files only. | 
 |  | 
C<archive> supports multiple input zip files with the constraint
that the first archive listed contains all primary data files
and all metadata files.
 |  | 
 |   -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip" | 
 |  | 
 | Input may also be defined using BSD glob wildcards. | 
 |  | 
 |   -i 'file/news*.zip' | 
 |  | 
The extended input array will be sorted in length order, so the shortest
path needs to contain all primary data files and all metadata files.
 |  | 
 | (The directory structure follows the base directory format, | 
 | that may include a C<.> root folder. | 
 | In this case further archives lacking a C<.> root folder | 
 | need to be passed with a hash sign in front of the archive's name. | 
This may require quoting the parameter.)
 |  | 
 | To support zip files, a version of C<unzip> needs to be installed that is | 
 | compatible with the archive file. | 
 |  | 
 | B<The root folder switch using the hash sign is experimental and | 
 | may vanish in future versions.> | 
 |  | 
 |  | 
 | =item B<--input-base|-ib> <directory> | 
 |  | 
 | The base directory for inputs. | 
 |  | 
 |  | 
 | =item B<--output|-o> <directory|file> | 
 |  | 
Output folder for archive processing or
document name for single output (optional);
writes to C<STDOUT> by default
(in case C<output> is not made mandatory by other options).
 |  | 
 | =item B<--overwrite|-w> | 
 |  | 
 | Overwrite files that already exist. | 
 |  | 
 |  | 
 | =item B<--token|-t> <foundry>#<file> | 
 |  | 
 | Define the default tokenization by specifying | 
 | the name of the foundry and optionally the name | 
 | of the layer-file. Defaults to C<OpenNLP#tokens>. | 
 | This will directly take the file instead of running | 
 | the layer implementation! | 
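
For example, to base the conversion on the tokenization of the
C<Base> foundry (the file name is illustrative and depends on your corpus):

  -t Base#tokens_aggressive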
 |  | 
 |  | 
 | =item B<--base-sentences|-bs> <foundry>#<layer> | 
 |  | 
 | Define the layer for base sentences. | 
 | If given, this will be used instead of using C<Base#Sentences>. | 
 | Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional | 
 | layers supported. | 
 |  | 
 |  Defaults to unset. | 
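
For example, to use DeReKo structure annotations as the
basis for sentence boundaries:

  -bs DeReKo#Structure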
 |  | 
 |  | 
 | =item B<--base-paragraphs|-bp> <foundry>#<layer> | 
 |  | 
 | Define the layer for base paragraphs. | 
 | If given, this will be used instead of using C<Base#Paragraphs>. | 
 | Currently C<DeReKo#Structure> is the only additional layer supported. | 
 |  | 
 |  Defaults to unset. | 
 |  | 
 |  | 
 | =item B<--base-pagebreaks|-bpb> <foundry>#<layer> | 
 |  | 
 | Define the layer for base pagebreaks. | 
 | Currently C<DeReKo#Structure> is the only layer supported. | 
 |  | 
 |  Defaults to unset. | 
 |  | 
 |  | 
 | =item B<--skip|-s> <foundry>[#<layer>] | 
 |  | 
 | Skip specific annotations by specifying the foundry | 
 | (and optionally the layer with a C<#>-prefix), | 
 | e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>. | 
 | Can be set multiple times. | 
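
For example, to skip all C<Mate> annotations and the
C<TreeTagger> sentence annotations:

  -s Mate -s TreeTagger#Sentences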
 |  | 
 |  | 
 | =item B<--anno|-a> <foundry>#<layer> | 
 |  | 
 | Convert specific annotations by specifying the foundry | 
 | (and optionally the layer with a C<#>-prefix), | 
 | e.g. C<Mate> or C<Mate#Morpho>. | 
 | Can be set multiple times. | 
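
Note that C<--anno> only takes effect in combination with
skipping all layers. For example, to convert nothing but
TreeTagger morphology and DeReKo structure annotations:

  -s '#ALL' -a TreeTagger#Morpho -a DeReKo#Structure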
 |  | 
 |  | 
 | =item B<--primary|-p> | 
 |  | 
 | Output primary data or not. Defaults to C<true>. | 
 | Can be flagged using C<--no-primary> as well. | 
 | This is I<deprecated>. | 
 |  | 
 |  | 
 | =item B<--non-word-tokens|-nwt> | 
 |  | 
 | Tokenize non-word tokens like word tokens (defined as matching | 
C</[\d\w]/>). Useful to treat punctuation marks as tokens.
 |  | 
 |  Defaults to unset. | 
 |  | 
 |  | 
 | =item B<--non-verbal-tokens|-nvt> | 
 |  | 
Tokenize non-verbal tokens that are marked in the primary data by
the Unicode symbol 'Black Vertical Rectangle', aka U+25AE.
 |  | 
 |  Defaults to unset. | 
 |  | 
 |  | 
 | =item B<--jobs|-j> | 
 |  | 
Define the number of concurrent jobs in separate forks
 | for archive processing. | 
 | Defaults to C<0> (everything runs in a single process). | 
 |  | 
Unless C<sequential-extraction> is set, this value will
also apply to extraction.
 |  | 
 | Pass -1, and the value will be set automatically to 5 | 
 | times the number of available cores. | 
 | This is I<experimental>. | 
 |  | 
 |  | 
 | =item B<--koral|-k> | 
 |  | 
 | Version of the output format. Supported versions are: | 
 | C<0> for legacy serialization, C<0.03> for serialization | 
 | with metadata fields as key-values on the root object, | 
 | C<0.4> for serialization with metadata fields as a list | 
 | of C<"@type":"koral:field"> objects. | 
 |  | 
 | Currently defaults to C<0.03>. | 
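
Schematically (field key and value illustrative), C<0.03> serializes
a metadata field as a key-value pair on the root object,

  "author" : "Goethe"

while C<0.4> serializes it as an object in a field list:

  { "@type" : "koral:field", "key" : "author", "value" : "Goethe" }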
 |  | 
 |  | 
 | =item B<--sequential-extraction|-se> | 
 |  | 
Flag to indicate that archives should be extracted sequentially,
i.e. that the C<jobs> value does not apply to extraction.
Some systems may have problems with extracting multiple archives
to the same folder at the same time.
Can be flagged using C<--no-sequential-extraction> as well.
Defaults to C<false>.
 |  | 
 |  | 
 | =item B<--meta|-m> | 
 |  | 
 | Define the metadata parser to use. Defaults to C<I5>. | 
 | Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace. | 
 | This is I<experimental>. | 
 |  | 
 |  | 
 | =item B<--pretty|-y> | 
 |  | 
 | Pretty print JSON output. Defaults to C<false>. | 
 | This is I<deprecated>. | 
 |  | 
 |  | 
 | =item B<--gzip|-z> | 
 |  | 
 | Compress the output. | 
 | Expects a defined C<output> file in single processing. | 
 |  | 
 |  | 
 | =item B<--cache|-c> | 
 |  | 
 | File to mmap a cache (using L<Cache::FastMmap>). | 
 | Defaults to C<korapxml2krill.cache> in the calling directory. | 
 |  | 
 |  | 
 | =item B<--cache-size|-cs> | 
 |  | 
 | Size of the cache. Defaults to C<50m>. | 
 |  | 
 |  | 
 | =item B<--cache-init|-ci> | 
 |  | 
 | Initialize cache file. | 
 | Can be flagged using C<--no-cache-init> as well. | 
 | Defaults to C<true>. | 
 |  | 
 |  | 
 | =item B<--cache-delete|-cd> | 
 |  | 
 | Delete cache file after processing. | 
 | Can be flagged using C<--no-cache-delete> as well. | 
 | Defaults to C<true>. | 
 |  | 
 |  | 
 | =item B<--config|-cfg> | 
 |  | 
Configure the parameters of your call in a file
of key-value pairs separated by whitespace
 |  | 
 |   overwrite 1 | 
 |   token     DeReKo#Structure | 
 |   ... | 
 |  | 
Supported parameters are:
C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>,
C<cache-init>, C<meta>, C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
C<base-sentences>, C<base-paragraphs>, C<base-pagebreaks>,
C<non-word-tokens>, C<non-verbal-tokens>, C<to-tar>,
C<skip> (semicolon separated), C<sigle>
(semicolon separated), C<anno> (semicolon separated).
 |  | 
 | Configuration parameters will always be overwritten by | 
 | passed parameters. | 
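
For example, a configuration file may look like this
(values illustrative):

  overwrite 1
  gzip      1
  jobs      -1
  token     OpenNLP#tokens
  skip      Malt;MDParser

and can be combined with further command line parameters:

  $ korapxml2krill archive --cfg my.conf -i corpus.zip -o out/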
 |  | 
 |  | 
 | =item B<--temporary-extract|-te> | 
 |  | 
 | Only valid for the C<archive> command. | 
 |  | 
This will first extract all files into a
directory and then process that directory.
If the directory is given as C<:temp:>,
a temporary directory is used.
This is especially useful to avoid
massive unzipping and potential
network latency.
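
For example, to unzip into a temporary directory before processing:

  $ korapxml2krill archive -i corpus.zip -o out/ -te :temp: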
 |  | 
 |  | 
 | =item B<--to-tar> | 
 |  | 
 | Only valid for the C<archive> command. | 
 |  | 
 | Writes the output into a tar archive. | 
 |  | 
 |  | 
 | =item B<--sigle|-sg> | 
 |  | 
 | Extract the given texts. | 
 | Can be set multiple times. | 
 | I<Currently only supported on C<extract>.> | 
 | Sigles have the structure C<Corpus>/C<Document>/C<Text>. | 
 | In case the C<Text> path is omitted, the whole document will be extracted. | 
 | On the document level, the postfix wildcard C<*> is supported. | 
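
For example, to extract a single text and all documents matching
a prefix (sigles illustrative):

  $ korapxml2krill extract -i corpus.zip -o out/ -sg 'WPD17/060/18486' -sg 'WPD17/06*'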
 |  | 
 |  | 
 | =item B<--log|-l> | 
 |  | 
The L<Log::Log4perl> log level, defaults to C<ERROR>.
 |  | 
 |  | 
 | =item B<--help|-h> | 
 |  | 
 | Print help information. | 
 |  | 
 |  | 
 | =item B<--version|-v> | 
 |  | 
 | Print version information. | 
 |  | 
 | =back | 
 |  | 
 |  | 
 | =head1 ANNOTATION SUPPORT | 
 |  | 
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
 |  | 
 |   Base | 
 |     #Paragraphs | 
 |     #Sentences | 
 |  | 
 |   Connexor | 
 |     #Morpho | 
 |     #Phrase | 
 |     #Sentences | 
 |     #Syntax | 
 |  | 
 |   CoreNLP | 
 |     #Constituency | 
 |     #Morpho | 
 |     #NamedEntities | 
 |     #Sentences | 
 |  | 
 |   CMC | 
 |     #Morpho | 
 |  | 
 |   DeReKo | 
 |     #Structure | 
 |  | 
 |   DGD | 
 |     #Morpho | 
 |     #Structure | 
 |  | 
 |   DRuKoLa | 
 |     #Morpho | 
 |  | 
 |   Glemm | 
 |     #Morpho | 
 |  | 
 |   HNC | 
 |     #Morpho | 
 |  | 
 |   LWC | 
 |     #Dependency | 
 |  | 
 |   Malt | 
 |     #Dependency | 
 |  | 
 |   MarMoT | 
 |     #Morpho | 
 |  | 
 |   Mate | 
 |     #Dependency | 
 |     #Morpho | 
 |  | 
 |   MDParser | 
 |     #Dependency | 
 |  | 
 |   OpenNLP | 
 |     #Morpho | 
 |     #Sentences | 
 |  | 
 |   RWK | 
 |     #Morpho | 
 |     #Structure | 
 |  | 
 |   Sgbr | 
 |     #Lemma | 
 |     #Morpho | 
 |  | 
 |   Talismane | 
 |     #Dependency | 
 |     #Morpho | 
 |  | 
 |   TreeTagger | 
 |     #Morpho | 
 |     #Sentences | 
 |  | 
 |   XIP | 
 |     #Constituency | 
 |     #Morpho | 
 |     #Sentences | 
 |  | 
 |  | 
 | More importers are in preparation. | 
 | New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace. | 
 | See the built-in annotation importers as examples. | 
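
A minimal sketch of such an importer, modeled on the bundled modules
(helper names like C<add_tokendata> follow the built-in importers and
should be verified against them):

  package KorAP::XML::Annotation::MyFoundry::Morpho;
  use KorAP::XML::Annotation::Base;

  sub parse {
    my $self = shift;

    # Iterate over all annotated tokens of the layer file
    $$self->add_tokendata(
      foundry => 'myfoundry',
      layer   => 'morpho',
      cb      => sub {
        my ($stream, $token) = @_;

        # Inspect the token's feature structure and add
        # terms (e.g. 'myfoundry/p:NN') to the token stream
        # ...
      }
    ) or return;

    return 1;
  };

  sub layer_info {
    ['myfoundry/p=tokens']
  };

  1;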
 |  | 
 |  | 
=head1 ABOUT KorAP-XML
 |  | 
 | KorAP-XML (Bański et al. 2012) is an implementation of the KorAP | 
 | data model (Bański et al. 2013), where text data are stored physically | 
 | separated from their interpretations (i.e. annotations). | 
 | A text document in KorAP-XML therefore consists of several files | 
 | containing primary data, metadata and annotations. | 
 |  | 
 | The structure of a single KorAP-XML document can be as follows: | 
 |  | 
 |   - data.xml | 
 |   - header.xml | 
 |     + base | 
 |       - tokens.xml | 
 |       - ... | 
 |     + struct | 
 |       - structure.xml | 
 |       - ... | 
 |     + corenlp | 
 |       - morpho.xml | 
 |       - constituency.xml | 
 |       - ... | 
 |     + tree_tagger | 
 |       - morpho.xml | 
 |       - ... | 
 |     - ... | 
 |  | 
 | The C<data.xml> contains the primary data, the C<header.xml> contains | 
 | the metadata, and the annotation layers are stored in subfolders | 
 | like C<base>, C<struct> or C<corenlp> | 
 | (so-called "foundries"; Bański et al. 2013). | 
 |  | 
 | Metadata is available in the TEI-P5 variant I5 | 
 | (Lüngen and Sperberg-McQueen 2012). See the documentation in | 
 | L<KorAP::XML::Meta::I5> for translatable fields. | 
 |  | 
 | Annotations correspond to a variant of the TEI-P5 feature structures | 
 | (TEI Consortium; Lee et al. 2004). | 
 | Annotation feature structures refer to character sequences of the primary text | 
 | inside the C<text> element of the C<data.xml>. | 
 | A single annotation containing the lemma of a token can have the following structure: | 
 |  | 
 |   <span from="0" to="3"> | 
 |     <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0"> | 
 |       <f name="lex"> | 
 |         <fs> | 
 |           <f name="lemma">zum</f> | 
 |         </fs> | 
 |       </f> | 
 |     </fs> | 
 |   </span> | 
 |  | 
The C<from> and C<to> attributes refer to the character span
in the primary text.
 | Depending on the kind of annotation (e.g. token-based, span-based, relation-based), | 
 | the structure may vary. See L<KorAP::XML::Annotation::*> for various | 
 | annotation preprocessors. | 
 |  | 
 | Multiple KorAP-XML documents are organized on three levels following | 
 | the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012): | 
corpus E<gt> document E<gt> text. On each level, metadata
can be stored, which C<korapxml2krill> will merge into a single metadata
object per text. A corpus is therefore structured as follows:
 |  | 
 |   + <corpus> | 
 |     - header.xml | 
 |     + <document> | 
 |       - header.xml | 
 |       + <text> | 
 |         - data.xml | 
 |         - header.xml | 
 |         - ... | 
 |     - ... | 
 |  | 
 | A single text can be identified by the concatenation of | 
 | the corpus identifier, the document identifier and the text identifier. | 
 | This identifier is called the text sigle | 
 | (e.g. a text with the identifier C<18486> in the document C<060> in the | 
 | corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>). | 
 |  | 
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
(e.g. one zip file per foundry), which is also supported (see C<--input>).
 |  | 
Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
in the form of a test suite.
 | The resulting JSON format merges all annotation layers | 
 | based on a single token stream. | 
 |  | 
 | =head2 References | 
 |  | 
 | Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011): | 
 | KorAP data model: first approximation, December. | 
 |  | 
 | Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012): | 
 | "The New IDS Corpus Analysis Platform: Challenges and Prospects", | 
 | Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012). | 
 | L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf> | 
 |  | 
 | Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013): | 
 | "Robust corpus architecture: a new look at virtual collections and data access", | 
 | Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25. | 
 | L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf> | 
 |  | 
 | Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck, | 
 | Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004): | 
 | "Towards an international standard on featurestructure representation", | 
 | Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004), | 
 | pp. 373-376. | 
 | L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf> | 
 |  | 
 | Harald Lüngen and C. M. Sperberg-McQueen (2012): | 
 | "A TEI P5 Document Grammar for the IDS Text Model", | 
 | Journal of the Text Encoding Initiative, Issue 3 | November 2012. | 
 | L<PDF|https://journals.openedition.org/jtei/pdf/508> | 
 |  | 
 | TEI Consortium, eds: | 
 | "Feature Structures", | 
 | Guidelines for Electronic Text Encoding and Interchange. | 
 | L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html> | 
 |  | 
 | =head1 AVAILABILITY | 
 |  | 
 |   https://github.com/KorAP/KorAP-XML-Krill | 
 |  | 
 |  | 
 | =head1 COPYRIGHT AND LICENSE | 
 |  | 
 | Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/> | 
 |  | 
 | Author: L<Nils Diewald|https://nils-diewald.de/> | 
 |  | 
 | Contributor: Eliza Margaretha | 
 |  | 
 | L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/> | 
 | Corpus Analysis Platform at the | 
 | L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>, | 
 | member of the | 
 | L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>. | 
 |  | 
 | This program is free software published under the | 
 | L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>. | 
 |  | 
 | =cut |