blob: b955dcfda86bb5124bcc9f35e32f9a9edb3b806b [file] [log] [blame]
Nils Diewald092178e2013-11-26 16:18:48 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron150b29e2016-02-14 23:06:48 +01004use lib 'lib';
Nils Diewald092178e2013-11-26 16:18:48 +00005use FindBin;
Akron96165ad2016-02-15 18:09:41 +01006use File::Temp;
Akron150b29e2016-02-14 23:06:48 +01007use File::Spec::Functions qw/catfile catdir/;
Nils Diewald092178e2013-11-26 16:18:48 +00008use Getopt::Long;
9use Directory::Iterator;
Akron150b29e2016-02-14 23:06:48 +010010use KorAP::XML::Krill;
11use KorAP::XML::Archive;
Akron96165ad2016-02-15 18:09:41 +010012use Benchmark qw/:hireswallclock/;
13use Parallel::ForkManager;
Nils Diewald092178e2013-11-26 16:18:48 +000014
15my $local = $FindBin::Bin;
16
Akron93d620e2016-02-05 19:40:05 +010017# Changes
18# 2013/11/25
19# - Initial release
20#
21# 2016/02/04
22# - Rename to korapxml2krill_dir
Akron069bd712016-02-12 19:09:06 +010023#
24# 2016/02/12
25# - Support overwrite
Akron150b29e2016-02-14 23:06:48 +010026#
27# 2016/02/14
28# - Added version information
Akronc1babed2016-02-15 11:48:18 +010029# - Added support for archive files
30#
31# 2016/02/15
32# - Fixed temporary directory bug
33# - Improved skipping before unzipping
Akron96165ad2016-02-15 18:09:41 +010034# - Added EXPERIMENTAL concurrency support
Akron150b29e2016-02-14 23:06:48 +010035
# Print the version of KorAP::XML::Krill to STDOUT and terminate.
# Asking for the version is not an error, so the script exits with
# status 0 (the same status --help uses).
sub printversion {
  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
  exit(0);
};
Akron93d620e2016-02-05 19:40:05 +010040
# Print the usage information to STDOUT and terminate.
#
# Parameters:
#   $_[0] - optional exit status, defaults to 0 if not defined
sub printhelp {
  my $status = shift;

  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents
for whole directories.

Call:
korapxml2krill_dir -z --input <directory> --output <directory>

  --input|-i <directory|file>     Directory or archive file of documents to index
  --output|-o <directory>         Name of output folder
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by defining the name
                                  with a # in front of the foundry,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --jobs|-j                       Define the number of concurrent jobs in separate forks,
                                  defaults to 0. This is EXPERIMENTAL!
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)
  --version|-v                    Print version information

diewald@ids-mannheim.de, 2016/02/15

EOHELP

  exit(defined $status ? $status : 0);
};
82
# Command line options. They are file-scoped, because the subs
# get_file_name and write_file read them directly.
my ($input, $output, $text, $gzip, $log_level, @skip,
    $token_base, $primary, @allow, $pretty,
    $overwrite);

# Number of concurrent forks; 0 keeps all work in the parent process
my $jobs = 0;

# GetOptions returns false on unknown or malformed options (it has
# already warned on STDERR then), so show the usage and stop instead
# of continuing with a half-parsed command line.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'human|m'     => \$text,
  'overwrite|w' => \$overwrite,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'jobs|j=i'    => \$jobs,
  'help|h'      => sub { printhelp() },
  'version|v'   => sub { printversion() }
) or printhelp(1);

# Input and output are both mandatory
printhelp(1) if !$input || !$output;
105
# Derive a flat file name from the path of a text directory.
#
# The first occurrence of the input path (file-scoped $input),
# optionally preceded by a slash, is removed from the path. All
# remaining slashes are turned into dashes and leading dashes are
# stripped, e.g. with an $input of "corpus" the path
# "corpus/A01/00001" becomes "A01-00001".
#
# Parameters:
#   $file - path of the text directory
# Returns:
#   The file name without any extension.
sub get_file_name {
  my $file = shift;

  # \Q...\E makes the input path match literally. Unquoted, a path
  # containing regex metacharacters (like "+" or "(") would change
  # the match or make the pattern fail to compile.
  $file =~ s{/?\Q$input\E}{};
  $file =~ tr{/}{-};
  $file =~ s{^-+}{};
  return $file;
};
Nils Diewald092178e2013-11-26 16:18:48 +0000113
# Convert a single text directory by calling the korapxml2krill
# script located next to this script.
#
# All conversion options are taken from the file-scoped command
# line variables.
#
# Parameters:
#   $anno - path of the text directory to convert
# Returns:
#   The name of the output file without extension (see get_file_name).
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  # TODO: This should be done directly with a data structure! KorAP::XML::Wrap

  my $target = $output . '/' . $file . '.json';
  $target .= '.gz' if $gzip;

  # The call is passed to system() as a list, so no shell is involved.
  # That way paths containing whitespace stay intact and values
  # like "#ALL" are not treated as the start of a shell comment.
  my @call = (
    'perl', $local . '/korapxml2krill',
    '-i', $anno,
    '-o', $target
  );
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # Primary data is written by default. $primary is only defined
  # and false, if --no-primary was passed explicitly.
  push @call, '--no-primary' if defined $primary && !$primary;

  # --pretty is a flag and takes no value
  push @call, '-y' if $pretty;
  push @call, '-a', $_ foreach @allow;
  push @call, '-s', $_ foreach @skip;

  # Report a failed conversion, but keep processing the other texts
  system(@call) == 0
    or warn "Unable to convert $anno (status $?)\n";

  return "$file";
};
135
# Zero means: everything runs in the parent process
my $pool = Parallel::ForkManager->new($jobs);

# Number of texts to process
my $count = 0;

# Number of texts already reported (converted or skipped)
my $iter = 0;

# Report on fork message
$pool->run_on_finish (
  sub {
    # The first two arguments are the process id and the exit code,
    # the last argument is the reference passed to finish().
    my ($pid, $code) = @_;
    my $data = $_[-1];

    # A process may end without passing any data
    my $msg = ref $data ? $$data : '';

    print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
      ($iter++) . "/$count]" .
      ($code ? " $code" : '') .
      " $msg\n";
  }
);
153
# Start time of the processing, set once the list of texts is known
my $t;
print "Reading data ...\n";

# Input is a directory
if (-d $input) {
  my $it = Directory::Iterator->new($input);
  my @dirs;
  my $dir;

  # Collect all directories containing a data.xml file
  while (1) {
    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
      push @dirs, $dir;
      $it->prune;
    };
    last unless $it->next;
  };

  print "Start processing ...\n";
  $t = Benchmark->new;
  $count = scalar @dirs;

  DIRECTORY_LOOP:
  for (my $i = 0; $i < $count; $i++) {

    # Skip texts with an existing output file before forking
    unless ($overwrite) {
      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      if (-e $filename) {
        $iter++;
        print "Skip $filename\n";
        next;
      };
    };

    # Get the next fork - the parent continues with the next text
    my $pid = $pool->start and next DIRECTORY_LOOP;

    my $msg = write_file($dirs[$i]);
    $pool->finish(0, \$msg);
  };
}

# Input is a file
elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  unless ($archive->test) {
    print "Zip archive not compatible.\n\n";
    exit(1);
  };

  print "Start processing ...\n";
  $t = Benchmark->new;
  my @dirs = $archive->list_texts;
  $count = scalar @dirs;

  ARCHIVE_LOOP:
  for (my $i = 0; $i < $count; $i++) {

    # Split path information
    # ($text_id instead of $text, to not shadow the --human flag)
    my ($prefix, $corpus, $doc, $text_id) = $archive->split_path($dirs[$i]);

    # Skip texts with an existing output file before unzipping
    unless ($overwrite) {
      my $filename = catfile(
        $output,
        get_file_name(catdir($doc, $text_id)) . '.json' . ($gzip ? '.gz' : '')
      );

      if (-e $filename) {
        $iter++;
        print "Skip $filename\n";
        next;
      };
    };

    # Get the next fork - the parent continues with the next text
    my $pid = $pool->start and next ARCHIVE_LOOP;

    # Create temporary directory
    my $temp = File::Temp->newdir;

    my $msg;

    # Extract from archive
    if ($archive->extract($dirs[$i], $temp)) {

      # Create corpus directory - $input is overwritten, because
      # get_file_name removes it from the path of the text
      $input = catdir("$temp", $corpus);

      # Temporary directory
      my $dir = catdir($input, $doc, $text_id);

      # Write file
      $msg = write_file($dir);

      # Release the temporary directory before the fork finishes
      $temp = undef;
      $pool->finish(0, \$msg);
    }
    else {

      $temp = undef;
      $msg = "Unable to extract " . $dirs[$i] . "\n";
      $pool->finish(1, \$msg);
    };
  };
}

else {
  print "Input is neither a directory nor an archive.\n\n";

  # Nothing was processed, so there is no start time for the
  # benchmark below - stop here with an error status
  exit(1);
};

$pool->wait_all_children;

print timestr(timediff(Benchmark->new, $t))."\n\n";
Nils Diewald092178e2013-11-26 16:18:48 +0000275
276__END__