| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
|  | 2 | use strict; | 
|  | 3 | use warnings; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 4 | use lib 'lib'; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 5 | use FindBin; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 6 | use File::Temp qw/tempdir/; | 
|  | 7 | use File::Spec::Functions qw/catfile catdir/; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 8 | use Getopt::Long; | 
|  | 9 | use Directory::Iterator; | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 10 | use KorAP::XML::Krill; | 
|  | 11 | use KorAP::XML::Archive; | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 12 |  | 
|  | 13 | my $local = $FindBin::Bin; | 
|  | 14 |  | 
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 15 | # Changes | 
|  | 16 | # 2013/11/25 | 
|  | 17 | # - Initial release | 
|  | 18 | # | 
|  | 19 | # 2016/02/04 | 
|  | 20 | # - Rename to korapxml2krill_dir | 
| Akron | 9078bb9 | 2016-02-12 19:09:06 +0100 | [diff] [blame] | 21 | # | 
|  | 22 | # 2016/02/12 | 
|  | 23 | # - Support overwrite | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 24 | # | 
|  | 25 | # 2016/02/14 | 
|  | 26 | # - Added version information | 
| Akron | a3b80da | 2016-02-15 11:48:18 +0100 | [diff] [blame^] | 27 | # - Added support for archive files | 
|  | 28 | # | 
|  | 29 | # 2016/02/15 | 
|  | 30 | # - Fixed temporary directory bug | 
|  | 31 | # - Improved skipping before unzipping | 
| Akron | dba4771 | 2016-02-14 23:06:48 +0100 | [diff] [blame] | 32 |  | 
# Print the version of KorAP::XML::Krill to STDOUT and terminate.
#
# Showing the version is a successfully served request, not a failure,
# so the process ends with exit status 0.
sub printversion {
  print "Version " . $KorAP::XML::Krill::VERSION . "\n\n";
  exit(0);
};
| Akron | 9a04c71 | 2016-02-05 19:40:05 +0100 | [diff] [blame] | 37 |  | 
# Print the usage text to STDOUT and terminate the process.
#
# Parameters:
#   $_[0] - optional exit status; 0 is used when it is omitted or undefined.
sub printhelp {
  my ($status) = @_;

  print <<'EOHELP';

Merge foundry data based on a tokenization and create indexer friendly documents
for whole directories.

Call:
  korapxml2krill_dir -z --input <directory> --output <directory>

  --input|-i <directory|file>     Directory or archive file of documents to index
  --output|-o <directory>         Name of output folder
  --overwrite|-w                  Overwrite files that already exist
  --token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
                                  the name of the foundry and optionally the name
                                  of the layer. Defaults to OpenNLP#tokens.
  --skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
                                  or specific layers by appending the layer name
                                  to the foundry name with a #,
                                  e.g. Mate#Morpho. Alternatively you can skip #ALL.
                                  Can be set multiple times.
  --allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
                                  combining the foundry name with a # and the layer name.
                                  Can be set multiple times.
  --primary|-p                    Output primary data or not. Defaults to true.
                                  Can be flagged using --no-primary as well.
  --human|-m                      Represent the data human friendly,
                                  while the output defaults to JSON
  --pretty|-y                     Pretty print json output
  --gzip|-z                       Compress the output
                                  (expects a defined output file)
  --log|-l                        The Log4perl log level, defaults to ERROR.
  --help|-h                       Print this document (optional)
  --version|-v                    Print version information

diewald@ids-mannheim.de, 2016/02/15

EOHELP

  exit(defined $status ? $status : 0);
};
|  | 77 |  | 
# Command line options. These lexicals are file-scoped on purpose:
# get_file_name and write_file read them directly.
my ($input, $output, $text, $gzip, $log_level, @skip,
    $token_base, $primary, @allow, $pretty, $overwrite);

# GetOptions returns false when the command line could not be parsed
# (e.g. unknown option or missing value). Print the usage and fail in
# that case instead of running with a half-parsed configuration.
GetOptions(
  'input|i=s'   => \$input,
  'output|o=s'  => \$output,
  'human|m'     => \$text,
  'overwrite|w' => \$overwrite,
  'token|t=s'   => \$token_base,
  'gzip|z'      => \$gzip,
  'skip|s=s'    => \@skip,
  'log|l=s'     => \$log_level,
  'allow|a=s'   => \@allow,
  'primary|p!'  => \$primary,
  'pretty|y'    => \$pretty,
  'help|h'      => sub { printhelp },
  'version|v'   => sub { printversion }
) or printhelp(1);

# Both an input and an output location are mandatory
printhelp(1) if !$input || !$output;
|  | 97 |  | 
# Derive a flat file name from the path of a text directory.
#
# The value of the file-scoped $input (optionally preceded by a slash) is
# removed from the beginning of the path, all remaining slashes are turned
# into dashes and leading dashes are dropped.
#
# Parameters:
#   $file - path of a text directory
# Returns:
#   the flattened name, without any file extension
sub get_file_name {
  my $file = shift;

  # \A pins the match to the start of the path, so only a real prefix is
  # removed. \Q...\E makes $input match literally - characters like "."
  # or "+" in a path must not act as regex metacharacters.
  $file =~ s{\A/?\Q$input\E}{};
  $file =~ tr{/}{-};
  $file =~ s{\A-+}{};
  return $file;
};
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 105 |  | 
# Convert a single text directory by running the korapxml2krill script
# located next to this script ($local).
#
# The result is written to <output>/<flat name>.json, with ".gz" appended
# when gzip was requested. All settings are taken from the file-scoped
# option variables. Progress is printed to STDOUT, problems running the
# converter are reported on STDERR.
#
# Parameters:
#   $anno - path of the text directory to convert
sub write_file {
  my $anno = shift;
  my $file = get_file_name($anno);

  my $target = catfile($output, $file . '.json' . ($gzip ? '.gz' : ''));

  # The command is built as an argument list and handed to system()
  # without a shell: paths containing blanks stay intact, and values
  # starting with '#' (e.g. "-s #ALL") are not cut off as shell comments.
  # $^X is the perl binary running this script.
  my @call = (
    $^X, catfile($local, 'korapxml2krill'),
    '-i', $anno,
    '-o', $target
  );
  push @call, '-z' if $gzip;
  push @call, '-m' if $text;
  push @call, '-w' if $overwrite;
  push @call, '-t', $token_base if $token_base;
  push @call, '-l', $log_level if $log_level;

  # --primary is negatable and stays undefined unless given on the command
  # line; only an explicit --no-primary is forwarded.
  push @call, '--no-primary' if defined $primary && !$primary;

  # -y is a plain switch and takes no value
  push @call, '-y' if $pretty;
  push @call, map { ('-a', $_) } @allow;
  push @call, map { ('-s', $_) } @skip;

  print "$file ";
  my $status = system(@call);
  my $error  = $!;    # keep the reason before print can overwrite $!
  print "\n";

  if ($status == -1) {
    warn "Unable to run korapxml2krill for $anno: $error\n";
  }
  elsif ($status != 0) {
    warn "korapxml2krill failed for $anno (exit status " . ($status >> 8) . ")\n";
  };

  return;
};
|  | 125 |  | 
# Input is a directory: collect the directory of every data.xml file found
# below $input and convert each of them.
if (-d $input) {
  my $it = Directory::Iterator->new($input);
  my @dirs;
  my $dir;
  while (1) {

    # The substitution both tests for a data.xml file and reduces the
    # path to its directory
    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
      push @dirs, $dir;

      # NOTE(review): presumably keeps the iterator from walking the rest
      # of this text directory - confirm against Directory::Iterator
      $it->prune;
    };
    last unless $it->next;
  };

  my $count = scalar @dirs;
  for my $i (0 .. $#dirs) {
    print 'Convert [' . ($i + 1) . "/$count] ";
    write_file($dirs[$i]);
  };
}

# Input is a file
elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  unless ($archive->test) {
    print "Zip archive not compatible.\n\n";
    exit(1);
  };

  my @dirs = $archive->list_texts;
  my $count = scalar @dirs;
  for my $i (0 .. $#dirs) {
    print 'Convert [' . ($i + 1) . "/$count] ";

    # Split path information ($text_id rather than $text, so the
    # file-scoped --human flag of the same name is not shadowed)
    my ($prefix, $corpus, $doc, $text_id) = $archive->split_path($dirs[$i]);

    # Check for an existing result before unzipping anything
    unless ($overwrite) {
      my $filename = catfile(
        $output,
        get_file_name(catdir($doc, $text_id)) . '.json' . ($gzip ? '.gz' : '')
      );
      if (-e $filename) {
        print "Skip $filename\n";
        next;
      };
    };

    # Create temporary directory
    my $temp = File::Temp->newdir;

    # Extract from archive
    if ($archive->extract($dirs[$i], $temp)) {

      # Point $input at the extracted corpus directory, so get_file_name
      # strips the temporary prefix from the path
      $input = catdir("$temp", $corpus);

      # Directory of the extracted text
      my $dir = catdir($input, $doc, $text_id);

      # Write file
      write_file($dir);
    }
    else {
      print "Unable to extract " . $dirs[$i] . "\n";
    };

    # Dropping the object removes the temporary directory right away
    $temp = undef;
  };
}

# Unsupported input is an error, so signal it with a non-zero exit
# status like the other error paths do
else {
  print "Input is neither a directory nor an archive.\n\n";
  exit(1);
};
|  | 204 |  | 
| Nils Diewald | 092178e | 2013-11-26 16:18:48 +0000 | [diff] [blame] | 205 |  | 
|  | 206 | __END__ |