| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
 | 2 | use strict; | 
 | 3 | use warnings; | 
 | 4 | use FindBin; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 5 | use Getopt::Long; | 
 | 6 | use Directory::Iterator; | 
 | 7 |  | 
 | 8 | my $local = $FindBin::Bin; | 
 | 9 |  | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 10 | # Changes | 
 | 11 | # 2013/11/25 | 
 | 12 | # - Initial release | 
 | 13 | # | 
 | 14 | # 2016/02/04 | 
 | 15 | # - Rename to korapxml2krill_dir | 
 | 16 |  | 
# Print the usage text to STDOUT and terminate the script.
#
# Parameters:
#   $_[0] - optional exit status; when it is not defined the script
#           exits with status 0.
#
# This sub never returns to its caller.
sub printhelp {
  my ($exit_code) = @_;

  my $usage = <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents
for whole directories.

Call:
korapxml2krill_dir -z --input <directory> --output <directory>

--input|-i <directory>          Directory of documents to index
--output|-o <directory>         Name of output folder
--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                the name of the foundry and optionally the name
                                of the layer. Defaults to OpenNLP#tokens.
--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                or specific layers by defining the name
                                with a # in front of the foundry,
                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                Can be set multiple times.
--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                combining the foundry name with a # and the layer name.
--primary|-p                    Output primary data or not. Defaults to true.
                                Can be flagged using --no-primary as well.
--human|-m                      Represent the data human friendly,
                                while the output defaults to JSON
--pretty|-y                     Pretty print json output
--gzip|-z                       Compress the output
                                (expects a defined output file)
--log|-l                        The Log4perl log level, defaults to ERROR.
--help|-h                       Print this document (optional)

diewald@ids-mannheim.de, 2016/02/04

EOHELP

  print $usage;

  # A missing status means "help was requested", which is not an error.
  $exit_code = 0 unless defined $exit_code;
  exit($exit_code);
}
 | 54 |  | 
 # Command line options. These file-level variables are read later by
 # write_file() when it assembles the call to korapxml2krill.
 my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);

 # GetOptions reports unknown or malformed options on STDERR and returns
 # false; in that case show the usage text and exit with an error status
 # instead of silently carrying on with a partial configuration.
 GetOptions(
   'input|i=s'   => \$input,
   'output|o=s'  => \$output,
   'human|m'     => \$text,
   'token|t=s'   => \$token_base,
   'gzip|z'      => \$gzip,
   'skip|s=s'    => \@skip,
   'log|l=s'     => \$log_level,
   'allow|a=s'   => \@allow,
   # Negatable: --primary sets 1, --no-primary sets 0, absent leaves undef.
   'primary|p!'  => \$primary,
   'pretty|y'    => \$pretty,
   'help|h'      => sub { printhelp(0) }
 ) or printhelp(1);

 # Input and output directories are both mandatory. Test for a non-empty
 # string rather than truthiness so that a directory named "0" is accepted.
 printhelp(1)
   unless defined $input  && length $input
       && defined $output && length $output;
 | 71 |  | 
 | 72 |  | 
 # Convert a single document directory by running the korapxml2krill
 # script that lives next to this one.
 #
 # Parameters:
 #   $anno - path of the document directory to convert
 #
 # Reads the file-level option variables ($input, $output, $gzip, $text,
 # $token_base, $log_level, $primary, $pretty, @allow, @skip) and $local.
 # The output file name is the document path with the leading $input
 # prefix removed and every "/" turned into "-", followed by ".json"
 # (".json.gz" when gzip is requested); it is written below $output.
 #
 # Prints the output base name before and a newline after the child run.
 # Returns true when the converter exited with status 0, false otherwise
 # (a warning describing the failure is emitted in that case).
 sub write_file {
   my $anno = shift;

   my $file = $anno;
   # Remove the input prefix only at the very start of the path and treat
   # $input literally, so regex metacharacters in the directory name
   # (".", "+", ...) cannot match unrelated text.
   $file =~ s{^/?\Q$input\E}{};
   $file =~ tr{/}{-};
   $file =~ s{^-+}{};

   my $target = $output . '/' . $file . '.json';
   $target .= '.gz' if $gzip;

   # Build the command as a list: system() then bypasses the shell, so
   # paths containing spaces survive and values such as "#ALL" are not
   # swallowed as a shell comment. $^X runs the child with the same perl
   # interpreter as this script.
   my @call = ($^X, $local . '/korapxml2krill', '-i', $anno, '-o', $target);
   push @call, '-z' if $gzip;
   push @call, '-m' if $text;
   push @call, '-t', $token_base if $token_base;
   push @call, '-l', $log_level if $log_level;
   # Primary data is on by default; only forward an explicit --no-primary.
   push @call, '--no-primary' if defined $primary && !$primary;
   # --pretty is a plain flag and takes no value.
   push @call, '-y' if $pretty;
   push @call, map { ('-a', $_) } @allow;
   push @call, map { ('-s', $_) } @skip;

   print "$file ";
   my $status = system(@call);
   print "\n";

   if ($status == -1) {
     warn "Unable to run korapxml2krill for $anno: $!\n";
   }
   elsif ($status != 0) {
     warn "korapxml2krill failed for $anno (exit code " . ($status >> 8) . ")\n";
   }

   return $status == 0;
 }
 | 93 |  | 
 | 94 |  | 
 # Walk the input directory and remember every directory that directly
 # contains a data.xml file.
 my $it = Directory::Iterator->new($input);
 my @dirs;
 my $dir;
 do {
   # The check runs once before the first call to next(), exactly like
   # the loop it replaces.
   if (!$it->is_directory) {
     $dir = $it->get;

     # Cutting off the trailing "/data.xml" leaves the document directory.
     if ($dir && $dir =~ s{/data\.xml$}{}) {
       push @dirs, $dir;

       # NOTE(review): prune presumably stops the iterator from scanning
       # the rest of this document directory - confirm against the
       # Directory::Iterator documentation.
       $it->prune;
     }
   }
 } while ($it->next);

 # Convert the collected documents one after another, printing a
 # "Convert [n/total] " progress prefix before each one.
 my $count = scalar @dirs;
 my $position = 0;
 foreach my $doc_dir (@dirs) {
   $position++;
   print 'Convert [' . $position . '/' . $count . '] ';
   write_file($doc_dir);
 }
 | 112 |  | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 113 |  | 
 | 114 | __END__ |