| #!/usr/bin/env perl | 
 | use strict; | 
 | use warnings; | 
 | use lib 'lib'; | 
 | use FindBin; | 
 | use File::Temp qw/tempdir/; | 
 | use File::Spec::Functions qw/catfile catdir/; | 
 | use Getopt::Long; | 
 | use Directory::Iterator; | 
 | use KorAP::XML::Krill; | 
 | use KorAP::XML::Archive; | 
 |  | 
# Directory that contains this script, as reported by FindBin. write_file
# builds the path of the korapxml2krill script it runs from this directory.
my $local = $FindBin::Bin;
 |  | 
 | # Changes | 
 | # 2013/11/25 | 
 | # - Initial release | 
 | # | 
 | # 2016/02/04 | 
 | # - Rename to korapxml2krill_dir | 
 | # | 
 | # 2016/02/12 | 
 | # - Support overwrite | 
 | # | 
 | # 2016/02/14 | 
 | # - Added version information | 
 | # - Added support for archive files | 
 | # | 
 | # 2016/02/15 | 
 | # - Fixed temporary directory bug | 
 | # - Improved skipping before unzipping | 
 |  | 
 | sub printversion { | 
 |   print "Version " . $KorAP::XML::Krill::VERSION . "\n\n"; | 
 |   exit(1); | 
 | }; | 
 |  | 
 | sub printhelp { | 
 |   print <<'EOHELP'; | 
 |  | 
 | Merge foundry data based on a tokenization and create indexer friendly documents | 
 | for whole directories. | 
 |  | 
 | Call: | 
 | korapxml2krill_dir -z --input <directory> --output <directory> | 
 |  | 
 |   --input|-i <directory|file>     Directory or archive file of documents to index | 
 |   --output|-o <directory>         Name of output folder | 
 |   --overwrite|-w                  Overwrite files that already exist | 
 |   --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying | 
 |                                   the name of the foundry and optionally the name | 
 |                                   of the layer. Defaults to OpenNLP#tokens. | 
 |   --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name | 
 |                                   or specific layers by defining the name | 
 |                                   with a # in front of the foundry, | 
 |                                   e.g. Mate#Morpho. Alternatively you can skip #ALL. | 
 |                                   Can be set multiple times. | 
 |   --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them | 
 |                                   combining the foundry name with a # and the layer name. | 
 |   --primary|-p                    Output primary data or not. Defaults to true. | 
 |                                   Can be flagged using --no-primary as well. | 
 |   --human|-m                      Represent the data human friendly, | 
 |                                   while the output defaults to JSON | 
 |   --pretty|-y                     Pretty print json output | 
 |   --gzip|-z                       Compress the output | 
 |                                   (expects a defined output file) | 
 |   --log|-l                        The Log4perl log level, defaults to ERROR. | 
 |   --help|-h                       Print this document (optional) | 
 |   --version|-v                    Print version information | 
 |  | 
 | diewald@ids-mannheim.de, 2016/02/15 | 
 |  | 
 | EOHELP | 
 |  | 
 |   exit(defined $_[0] ? $_[0] : 0); | 
 | }; | 
 |  | 
 | my ($input, $output, $text, $gzip, $log_level, @skip, | 
 |     $token_base, $primary, @allow, $pretty, $overwrite); | 
 | GetOptions( | 
 |   'input|i=s'   => \$input, | 
 |   'output|o=s'  => \$output, | 
 |   'human|m'     => \$text, | 
 |   'overwrite|w' => \$overwrite, | 
 |   'token|t=s'   => \$token_base, | 
 |   'gzip|z'      => \$gzip, | 
 |   'skip|s=s'    => \@skip, | 
 |   'log|l=s'     => \$log_level, | 
 |   'allow|a=s'   => \@allow, | 
 |   'primary|p!'  => \$primary, | 
 |   'pretty|y'    => \$pretty, | 
 |   'help|h'      => sub { printhelp }, | 
 |   'version|v'   => sub { printversion } | 
 | ); | 
 |  | 
 | printhelp(1) if !$input || !$output; | 
 |  | 
 | sub get_file_name { | 
 |   my $file = shift; | 
 |   $file =~ s/^?\/?$input//; | 
 |   $file =~ tr/\//-/; | 
 |   $file =~ s{^-+}{}; | 
 |   return $file; | 
 | }; | 
 |  | 
 | # write file | 
 | sub write_file { | 
 |   my $anno = shift; | 
 |   my $file = get_file_name($anno); | 
 |  | 
 |   my $call = 'perl ' . $local . '/korapxml2krill -i ' . $anno . ' -o ' . $output . '/' . $file . '.json'; | 
 |   $call .= '.gz -z' if $gzip; | 
 |   $call .= ' -m' if $text; | 
 |   $call .= ' -w' if $overwrite; | 
 |   $call .= ' -t ' . $token_base if $token_base; | 
 |   $call .= ' -l ' . $log_level if $log_level; | 
 |   $call .= ' --no-primary ' if $primary; | 
 |   $call .= ' -y ' . $pretty if $pretty; | 
 |   $call .= ' -a ' . $_ foreach @allow; | 
 |   $call .= ' -s ' . $_ foreach @skip; | 
 |   print "$file "; | 
 |   system($call); | 
 |   print "\n"; | 
 | }; | 
 |  | 
 | # Input is a directory | 
 | if (-d $input) { | 
 |   my $it = Directory::Iterator->new($input); | 
 |   my @dirs; | 
 |   my $dir; | 
 |   while (1) { | 
 |     if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) { | 
 |       push @dirs, $dir; | 
 |       $it->prune; | 
 |     }; | 
 |     last unless $it->next; | 
 |   }; | 
 |  | 
 |   my $count = scalar @dirs; | 
 |   for (my $i = 0; $i < $count; $i++) { | 
 |     print 'Convert [' . ($i + 1) . "/$count] "; | 
 |     write_file($dirs[$i]); | 
 |   }; | 
 | } | 
 |  | 
 | # Input is a file | 
 | elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) { | 
 |   unless ($archive->test_unzip) { | 
 |     print "Unzip is not installed or incompatible.\n\n"; | 
 |     exit(1); | 
 |   }; | 
 |  | 
 |   unless ($archive->test) { | 
 |     print "Zip archive not compatible.\n\n"; | 
 |     exit(1); | 
 |   }; | 
 |  | 
 |   my @dirs = $archive->list_texts; | 
 |   my $count = scalar @dirs; | 
 |   for (my $i = 0; $i < $count; $i++) { | 
 |     print 'Convert [' . ($i + 1) . "/$count] "; | 
 |  | 
 |     # Split path information | 
 |     my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]); | 
 |  | 
 |     unless ($overwrite) { | 
 |  | 
 |       my $filename = catfile( | 
 | 	$output, | 
 | 	get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '') | 
 |       ); | 
 |       if (-e $filename) { | 
 | 	print "Skip $filename\n"; | 
 | 	next; | 
 |       }; | 
 |     }; | 
 |  | 
 |     # Create temporary file | 
 |     my $temp = File::Temp->newdir; | 
 |  | 
 |     # Extract from archive | 
 |     if ($archive->extract($dirs[$i], $temp)) { | 
 |  | 
 |       # Create corpus directory | 
 |       $input = catdir("$temp", $corpus); | 
 |  | 
 |       # Temporary directory | 
 |       my $dir = catdir($input, $doc, $text); | 
 |  | 
 |       # Write file | 
 |       write_file($dir); | 
 |     } | 
 |     else { | 
 |       print "Unable to extract " . $dirs[$i] . "\n"; | 
 |     }; | 
 |  | 
 |     $temp = undef; | 
 |   }; | 
 | } | 
 |  | 
 | else { | 
 |   print "Input is neither a directory nor an archive.\n\n"; | 
 | }; | 
 |  | 
 |  | 
 | __END__ |