| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
|  | 2 | use strict; | 
|  | 3 | use warnings; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 4 | use lib 'lib'; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 5 | use FindBin; | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 6 | use File::Temp; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 7 | use File::Spec::Functions qw/catfile catdir/; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 8 | use Getopt::Long; | 
|  | 9 | use Directory::Iterator; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 10 | use KorAP::XML::Krill; | 
|  | 11 | use KorAP::XML::Archive; | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 12 | use Benchmark qw/:hireswallclock/; | 
|  | 13 | use Parallel::ForkManager; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 14 |  | 
|  | 15 | my $local = $FindBin::Bin; | 
|  | 16 |  | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 17 | # Changes | 
|  | 18 | # 2013/11/25 | 
|  | 19 | # - Initial release | 
|  | 20 | # | 
|  | 21 | # 2016/02/04 | 
|  | 22 | # - Rename to korapxml2krill_dir | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 23 | # | 
|  | 24 | # 2016/02/12 | 
|  | 25 | # - Support overwrite | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 26 | # | 
|  | 27 | # 2016/02/14 | 
|  | 28 | # - Added version information | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 29 | # - Added support for archive files | 
|  | 30 | # | 
|  | 31 | # 2016/02/15 | 
|  | 32 | # - Fixed temporary directory bug | 
|  | 33 | # - Improved skipping before unzipping | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 34 | # - Added EXPERIMENTAL concurrency support | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 35 |  | 
|  | 36 | sub printversion { | 
|  | 37 | print "Version " . $KorAP::XML::Krill::VERSION . "\n\n"; | 
|  | 38 | exit(1); | 
|  | 39 | }; | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 40 |  | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 41 | sub printhelp { | 
|  | 42 | print <<'EOHELP'; | 
|  | 43 |  | 
|  | 44 | Merge foundry data based on a tokenization and create indexer friendly documents | 
|  | 45 | for whole directories. | 
|  | 46 |  | 
|  | 47 | Call: | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 48 | korapxml2krill_dir -z --input <directory> --output <directory> | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 49 |  | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 50 | --input|-i <directory|file>     Directory or archive file of documents to index | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 51 | --output|-o <directory>         Name of output folder | 
|  | 52 | --overwrite|-w                  Overwrite files that already exist | 
|  | 53 | --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying | 
|  | 54 | the name of the foundry and optionally the name | 
|  | 55 | of the layer. Defaults to OpenNLP#tokens. | 
|  | 56 | --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name | 
|  | 57 | or specific layers by defining the name | 
|  | 58 | with a # in front of the foundry, | 
|  | 59 | e.g. Mate#Morpho. Alternatively you can skip #ALL. | 
|  | 60 | Can be set multiple times. | 
|  | 61 | --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them | 
|  | 62 | combining the foundry name with a # and the layer name. | 
|  | 63 | --primary|-p                    Output primary data or not. Defaults to true. | 
|  | 64 | Can be flagged using --no-primary as well. | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 65 | --jobs|-j                       Define the number of concurrent jobs in seperated forks, | 
|  | 66 | defaults to 0. This is EXPERIMENTAL! | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 67 | --human|-m                      Represent the data human friendly, | 
|  | 68 | while the output defaults to JSON | 
|  | 69 | --pretty|-y                     Pretty print json output | 
|  | 70 | --gzip|-z                       Compress the output | 
|  | 71 | (expects a defined output file) | 
|  | 72 | --log|-l                        The Log4perl log level, defaults to ERROR. | 
|  | 73 | --help|-h                       Print this document (optional) | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 74 | --version|-v                    Print version information | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 75 |  | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 76 | diewald@ids-mannheim.de, 2016/02/15 | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 77 |  | 
|  | 78 | EOHELP | 
|  | 79 |  | 
|  | 80 | exit(defined $_[0] ? $_[0] : 0); | 
|  | 81 | }; | 
|  | 82 |  | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 83 | my ($input, $output, $text, $gzip, $log_level, @skip, | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 84 | $token_base, $primary, @allow, $pretty, | 
|  | 85 | $overwrite); | 
|  | 86 | my $jobs = 0; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 87 | GetOptions( | 
|  | 88 | 'input|i=s'   => \$input, | 
|  | 89 | 'output|o=s'  => \$output, | 
|  | 90 | 'human|m'     => \$text, | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 91 | 'overwrite|w' => \$overwrite, | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 92 | 'token|t=s'   => \$token_base, | 
|  | 93 | 'gzip|z'      => \$gzip, | 
|  | 94 | 'skip|s=s'    => \@skip, | 
|  | 95 | 'log|l=s'     => \$log_level, | 
|  | 96 | 'allow|a=s'   => \@allow, | 
|  | 97 | 'primary|p!'  => \$primary, | 
|  | 98 | 'pretty|y'    => \$pretty, | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 99 | 'jobs|j=i'    => \$jobs, | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 100 | 'help|h'      => sub { printhelp }, | 
|  | 101 | 'version|v'   => sub { printversion } | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 102 | ); | 
|  | 103 |  | 
|  | 104 | printhelp(1) if !$input || !$output; | 
|  | 105 |  | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 106 | sub get_file_name { | 
|  | 107 | my $file = shift; | 
|  | 108 | $file =~ s/^?\/?$input//; | 
|  | 109 | $file =~ tr/\//-/; | 
|  | 110 | $file =~ s{^-+}{}; | 
|  | 111 | return $file; | 
|  | 112 | }; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 113 |  | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 114 | # write file | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 115 | sub write_file { | 
|  | 116 | my $anno = shift; | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 117 | my $file = get_file_name($anno); | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 118 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 119 | # TODO: This should be done directly with a data structure! KorAP::XML::Wrap | 
|  | 120 |  | 
|  | 121 | my $call = 'perl ' . $local . '/korapxml2krill -i ' . | 
|  | 122 | $anno . ' -o ' . $output . '/' . $file . '.json'; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 123 | $call .= '.gz -z' if $gzip; | 
|  | 124 | $call .= ' -m' if $text; | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 125 | $call .= ' -w' if $overwrite; | 
| Akron | 508c18e | 2016-02-07 23:54:15 +0100 | [diff] [blame] | 126 | $call .= ' -t ' . $token_base if $token_base; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 127 | $call .= ' -l ' . $log_level if $log_level; | 
|  | 128 | $call .= ' --no-primary ' if $primary; | 
|  | 129 | $call .= ' -y ' . $pretty if $pretty; | 
|  | 130 | $call .= ' -a ' . $_ foreach @allow; | 
|  | 131 | $call .= ' -s ' . $_ foreach @skip; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 132 | system($call); | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 133 | return "$file"; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 134 | }; | 
|  | 135 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 136 | # Zero means: everything runs in the parent process | 
|  | 137 | my $pool = Parallel::ForkManager->new($jobs); | 
|  | 138 |  | 
|  | 139 | my $count = 0; | 
|  | 140 | my $iter = 0; | 
|  | 141 |  | 
|  | 142 | # Report on fork message | 
|  | 143 | $pool->run_on_finish ( | 
|  | 144 | sub { | 
|  | 145 | my ($pid, $code) = shift; | 
|  | 146 | my $data = pop; | 
|  | 147 | print 'Convert ['. ($jobs > 0 ? "$pid:" : '') . | 
|  | 148 | ($iter++) . "/$count]" . | 
|  | 149 | ($code ? " $code" : '') . | 
|  | 150 | " $$data\n"; | 
|  | 151 | } | 
|  | 152 | ); | 
|  | 153 |  | 
|  | 154 | my $t; | 
|  | 155 | print "Reading data ...\n"; | 
|  | 156 |  | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 157 | # Input is a directory | 
|  | 158 | if (-d $input) { | 
|  | 159 | my $it = Directory::Iterator->new($input); | 
|  | 160 | my @dirs; | 
|  | 161 | my $dir; | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 162 |  | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 163 | while (1) { | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 164 | if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) { | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 165 | push @dirs, $dir; | 
|  | 166 | $it->prune; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 167 | }; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 168 | last unless $it->next; | 
|  | 169 | }; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 170 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 171 | print "Start processing ...\n"; | 
|  | 172 | $t = Benchmark->new; | 
|  | 173 | $count = scalar @dirs; | 
|  | 174 |  | 
|  | 175 | DIRECTORY_LOOP: | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 176 | for (my $i = 0; $i < $count; $i++) { | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 177 |  | 
|  | 178 | unless ($overwrite) { | 
|  | 179 | my $filename = catfile( | 
|  | 180 | $output, | 
|  | 181 | get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '') | 
|  | 182 | ); | 
|  | 183 |  | 
|  | 184 | if (-e $filename) { | 
|  | 185 | $iter++; | 
|  | 186 | print "Skip $filename\n"; | 
|  | 187 | next; | 
|  | 188 | }; | 
|  | 189 | }; | 
|  | 190 |  | 
|  | 191 | # Get the next fork | 
|  | 192 | my $pid = $pool->start and next DIRECTORY_LOOP; | 
|  | 193 | my $msg; | 
|  | 194 |  | 
|  | 195 | $msg = write_file($dirs[$i]); | 
|  | 196 | $pool->finish(0, \$msg); | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 197 | }; | 
|  | 198 | } | 
|  | 199 |  | 
|  | 200 | # Input is a file | 
|  | 201 | elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) { | 
|  | 202 | unless ($archive->test_unzip) { | 
|  | 203 | print "Unzip is not installed or incompatible.\n\n"; | 
|  | 204 | exit(1); | 
|  | 205 | }; | 
|  | 206 |  | 
|  | 207 | unless ($archive->test) { | 
|  | 208 | print "Zip archive not compatible.\n\n"; | 
|  | 209 | exit(1); | 
|  | 210 | }; | 
|  | 211 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 212 | print "Start processing ...\n"; | 
|  | 213 | $t = Benchmark->new; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 214 | my @dirs = $archive->list_texts; | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 215 | $count = scalar @dirs; | 
|  | 216 |  | 
|  | 217 | ARCHIVE_LOOP: | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 218 | for (my $i = 0; $i < $count; $i++) { | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 219 |  | 
|  | 220 | # Split path information | 
|  | 221 | my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]); | 
|  | 222 |  | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 223 | unless ($overwrite) { | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 224 | my $filename = catfile( | 
|  | 225 | $output, | 
|  | 226 | get_file_name(catdir($doc, $text)) . '.json' . ($gzip ? '.gz' : '') | 
|  | 227 | ); | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 228 |  | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 229 | if (-e $filename) { | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 230 | $iter++; | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 231 | print "Skip $filename\n"; | 
|  | 232 | next; | 
|  | 233 | }; | 
|  | 234 | }; | 
|  | 235 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 236 | # Get the next fork | 
|  | 237 | my $pid = $pool->start and next ARCHIVE_LOOP; | 
|  | 238 |  | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 239 | # Create temporary file | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 240 | my $temp = File::Temp->newdir; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 241 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 242 | my $msg; | 
|  | 243 |  | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 244 | # Extract from archive | 
|  | 245 | if ($archive->extract($dirs[$i], $temp)) { | 
|  | 246 |  | 
|  | 247 | # Create corpus directory | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame] | 248 | $input = catdir("$temp", $corpus); | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 249 |  | 
|  | 250 | # Temporary directory | 
|  | 251 | my $dir = catdir($input, $doc, $text); | 
|  | 252 |  | 
|  | 253 | # Write file | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 254 | $msg = write_file($dir); | 
|  | 255 |  | 
|  | 256 | $temp = undef; | 
|  | 257 | $pool->finish(0, \$msg); | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 258 | } | 
|  | 259 | else { | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 260 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 261 | $temp = undef; | 
|  | 262 | $msg = "Unable to extract " . $dirs[$i] . "\n"; | 
|  | 263 | $pool->finish(1, \$msg); | 
|  | 264 | }; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 265 | }; | 
|  | 266 | } | 
|  | 267 |  | 
|  | 268 | else { | 
|  | 269 | print "Input is neither a directory nor an archive.\n\n"; | 
| Akron | 0fe59d7 | 2016-02-11 22:13:36 +0100 | [diff] [blame] | 270 | }; | 
|  | 271 |  | 
| Akron | 16b40db | 2016-02-15 18:09:41 +0100 | [diff] [blame^] | 272 | $pool->wait_all_children; | 
|  | 273 |  | 
|  | 274 | print timestr(timediff(Benchmark->new, $t))."\n\n"; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 275 |  | 
|  | 276 | __END__ |