blob: 93e5eacec66b8748ecfa0d5dd981668f5209c1af [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
26
Akronc11f7982017-02-21 21:20:14 +010027
28# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010029# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010030# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010031
Akronc11f7982017-02-21 21:20:14 +010032# TODO: Use KorAP::XML::ForkPool!
33
Akron941c1a62016-02-23 17:41:41 +010034# CHANGES:
35# ----------------------------------------------------------
36# 2013/11/25
37# - Initial release
38#
39# 2014/10/29
40# - Merges foundry data to create indexer friendly documents
41#
Akron93d620e2016-02-05 19:40:05 +010042# 2016/02/04
43# - renamed to korapxml2krill
44# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010045#
46# 2016/02/12
47# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010048# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010049#
50# 2016/02/14
51# - Added version information
Akron941c1a62016-02-23 17:41:41 +010052# - Added support for archive files
53#
54# 2016/02/15
55# - Fixed temporary directory bug
56# - Improved skipping before unzipping
57# - Added EXPERIMENTAL concurrency support
58#
59# 2016/02/23
60# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010061#
62# 2016/02/27
63# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010064#
65# 2016/03/17
66# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010067#
68# 2016/03/18
69# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020070#
Akronf3f0c942016-06-27 13:27:14 +020071# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020072# - Added multi archive support
73# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020074# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020075#
76# 2016/07/06
77# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020078#
79# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020080# - Fixed temporary path issue in script
81#
82# 2016/10/24
83# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020084#
Akronb4bbec72016-10-26 20:21:02 +020085# 2016/10/24
86# - Added support for document extraction
87#
Akron3741f8b2016-12-21 19:55:21 +010088# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020089# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020090#
Akron3741f8b2016-12-21 19:55:21 +010091# 2016/12/21
92# - added support for base-sentences and base-tokenizations
93#
Akron4fa37c32017-01-20 14:43:10 +010094# 2017/01/20
95# - added support for DRuKoLa annotations
96#
Akron41ac10b2017-02-08 22:47:25 +010097# 2017/02/08
98# - added support for pagebreak annotations
99#
Akron821db3d2017-04-06 21:19:31 +0200100# 2017/04/06
101# - added support for wildcards in input
102#
Akron636aa112017-04-07 18:48:56 +0200103# 2017/04/07
104# - support configuration option
Akron81500102017-04-07 20:45:44 +0200105# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200106#
Akron941c1a62016-02-23 17:41:41 +0100107# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100108
# Date of the latest change to this script
our $LAST_CHANGE = '2017/04/07';

# Directory the script lives in (as resolved by FindBin)
our $LOCAL = $FindBin::Bin;

# One-line banner printed by --help, --version and on usage errors
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command:
# The first argument is taken as the command (e.g. 'archive' or 'extract')
# unless it starts with a dash, in which case it is already an option
# and no command was given.
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron93d620e2016-02-05 19:40:05 +0100121
# Options that may be passed multiple times are collected in arrays
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# All scalar options are declared in place, so they stay undefined
# unless they were passed explicitly. This is what allows values from a
# configuration file and the built-in defaults to be applied afterwards.
# GetOptions() reports invalid options on STDERR and returns false in
# that case - instead of continuing with a partially parsed command
# line, the usage is printed and the script exits with status 1.
GetOptions(
  'input|i=s'               => \@input,
  'output|o=s'              => \(my $output),
  'overwrite|w'             => \(my $overwrite),
  'meta|m=s'                => \(my $meta),
  'token|t=s'               => \(my $token_base),
  'base-sentences|bs=s'     => \(my $base_sentences),
  'base-paragraphs|bp=s'    => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s'   => \(my $base_pagebreaks),
  'gzip|z'                  => \(my $gzip),
  'temporary-extract|te=s'  => \(my $extract_dir),
  'skip|s=s'                => \@skip,
  'sigle|sg=s'              => \@sigle,
  'cache|c=s'               => \(my $cache_file),
  'config|cfg=s'            => \(my $cfg_file),
  'log|l=s'                 => \(my $log_level),
  'anno|a=s'                => \@anno,
  'primary|p!'              => \(my $primary),
  'pretty|y'                => \(my $pretty),
  'jobs|j=i'                => \(my $jobs),
  'cache-size|cs=s'         => \(my $cache_size),
  'cache-delete|cd!'        => \(my $cache_delete),
  'cache-init|ci!'          => \(my $cache_init),

  # Print the usage sections of the POD together with the version banner
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner (and the synopsis)
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
165
# Load from configuration.
# Values from the configuration file are only used for options that
# were not passed on the command line.
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: configuration key => variable to fill
  my @scalar_options = (
    [ 'overwrite'         => \$overwrite       ],
    [ 'gzip'              => \$gzip            ],
    [ 'jobs'              => \$jobs            ],
    [ 'temporary-extract' => \$extract_dir     ],
    [ 'token'             => \$token_base      ],
    [ 'cache'             => \$cache_file      ],
    [ 'cache-size'        => \$cache_size      ],
    [ 'cache-delete'      => \$cache_delete    ],
    [ 'cache-init'        => \$cache_init      ],
    [ 'meta'              => \$meta            ],
    [ 'output'            => \$output          ],
    [ 'base-sentences'    => \$base_sentences  ],
    [ 'base-paragraphs'   => \$base_paragraphs ],
    [ 'base-pagebreaks'   => \$base_pagebreaks ],
    [ 'log'               => \$log_level       ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $value_ref) = @$option;

    # The command line takes precedence
    next if defined $$value_ref;

    $$value_ref = $config{$key} if defined $config{$key};
  };

  # List options: the configuration value is a semicolon separated list
  my @list_options = (
    [ 'skip'  => \@skip  ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno  ]
  );

  foreach my $option (@list_options) {
    my ($key, $list_ref) = @$option;

    # The command line takes precedence
    next if scalar(@$list_ref);

    @$list_ref = split /\s*;\s*/, $config{$key} if defined $config{$key};
  };
}

# A configuration file was requested but can't be found -
# don't silently fall back to the defaults
elsif ($cfg_file) {
  warn "Config file '$cfg_file' does not exist - ignoring it\n";
};
265
# Apply the built-in defaults to everything that was neither set
# on the command line nor in the configuration file
$token_base   = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file   = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size   = '50m'                  unless defined $cache_size;
$jobs         = 0                      unless defined $jobs;
$cache_delete = 1                      unless defined $cache_delete;
$cache_init   = 1                      unless defined $cache_init;
$log_level    = 'ERROR'                unless defined $log_level;

# Base annotations default to the empty string and are compared lowercased
foreach ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $_ = lc($_ // '');
};

# Parameters for pod2usage() in case of invalid invocations:
# print the usage sections to STDOUT and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
};

# Initialize log4perl object - everything is logged to STDERR
my %log_config = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');

# A job count of -1 means: derive the number of jobs
# from the number of available cores (five jobs per core)
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
312
Akron821db3d2017-04-06 21:19:31 +0200313
# Lookup table of everything to skip (foundry or foundry#layer, lowercased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# DeReKo structure may serve as the base for sentences, paragraphs
# and pagebreaks - collect all roles it was requested for
my @dereko_attr;
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

my @dereko_structure = ('DeReKo', 'Structure');
if ($dereko_attr[0]) {
  push @dereko_structure, 'base-' . join('-', @dereko_attr);
};

# All supported annotation layers as [Foundry, Layer(, Parameter)]
my @layers = (

  # Base annotations are only added if no other base was chosen
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # DeReKo
  [ @dereko_structure ],

  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly
# requested annotations (Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};
410
# Get tokenization basis (given as Foundry#Layer).
# The variables are declared separately from the conditional assignment,
# as the behaviour of "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Cache object passed to the batch processor
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object that converts single documents
# based on the chosen tokenization and annotation layers
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
434
Akron941c1a62016-02-23 17:41:41 +0100435
# Get file name based on path information.
#
# Expects the path to a text (or the designated file name of a text)
# and returns a flat name, in which the leading temporary directory
# and the input path are removed and all remaining slashes are
# replaced by dashes (e.g. 'corpus/WPD/00001/A' => 'WPD-00001-A').
# The first element of @input serves as the input path.
sub get_file_name ($) {
  my $file = shift;

  # In case the input is a directory, the last path segment
  # is kept (if the path does not end with a slash)
  my $i = $input[0];
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path. The path is quoted, so characters with a
  # special meaning in regular expressions (like '+', '(' or the
  # backslashes of Windows paths) are matched literally.
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Remove the leading segment, if more than three segments remain
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
452
# Convert sigle to path construct (CORPUS_DOC.TEXT => CORPUS/DOC/TEXT)
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# All commands require the output (if given) to be an existing directory
if ($cmd) {
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";

    # This is an error - signal it to the caller
    exit(1);
  };
};

# Glob files
if (@input) {

  # Expand the wildcards of all inputs
  my @new_input = map { bsd_glob($_) } @input;

  # The input is only rewritten in case the expansion results in more files.
  # NOTE(review): A wildcard matching exactly one file is therefore
  # not expanded - confirm whether this is intended.
  if (scalar(@new_input) > scalar(@input)) {

    # Shortest names first
    @input = sort { length($a) <=> length($b) } @new_input;
    print 'Input rewritten to ' . join(', ', @input)."\n";
  };
};
477
478
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # Take the start time as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last call of stop_time()
  # and since the start of the script
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document -
  # the path is expected to end with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();

  # Finished regularly - exit with success status
  exit(0);
};
510
Nils Diewald59094f22014-11-05 18:20:50 +0000511
# Extract XML files:
# Extracts all texts (or the documents and texts given by sigle)
# from the input archives to the output directory.
if ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";

      # This is an error - signal it to the caller
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    my $prefix = 1;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {
      my @new_sigle;

      my $prefix_check = 0;

      # Check (only once) if the paths in the archive need a prefix
      my $check_prefix = sub {
        return if $prefix_check;
        $prefix = $archive->check_prefix;
        print " with prefix ..." if $prefix;
        $prefix_check = 1;
        return;
      };

      # Iterate over all sigle
      foreach (@sigle) {

        # Sigle is a doc sigle - extract the document immediately
        if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$_ ...";

          # Check if a prefix is needed
          $check_prefix->();

          print "\n";

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $_;

          print '... ' . (
            $archive->extract_doc(
              $path, $output, $jobs
            ) ? '' : 'not '
          );
          print "extracted.\n";
        }

        # Sigle is a text sigle - extract it later on
        else {
          push @new_sigle, $_;

          $check_prefix->();
        };
      };
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach (@sigle) {

      print "$_ ...\n";

      # TODO: Make this OS independent
      print '... ' . (
        $archive->extract_text(
          ($prefix ? './' : '') . $_, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    print "\n";
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}


# Process an archive:
# Converts all texts of a corpus directory or of one or more zip archives.
elsif ($cmd eq 'archive') {

  # First extract, then archive
  if (defined $extract_dir) {

    # Create new archive object
    if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

      # Check zip capabilities
      unless ($archive->test_unzip) {
        print "Unzip is not installed or incompatible.\n\n";

        # This is an error - signal it to the caller
        exit(1);
      };

      # Add further annotation archived
      $archive->attach($_) foreach @input[1..$#input];

      # Create a temporary directory
      # (that is removed when the script ends)
      if ($extract_dir eq ':temp:') {
        $extract_dir = tempdir(CLEANUP => 1);
      };

      # From now on the extracted directory is treated as the input
      if ($archive->extract_all($extract_dir, $jobs)) {
        @input = ($extract_dir);
      }
      else {
        $log->error('Unable to extract from primary archive ' . $input[0] .
                      ' to ' . $extract_dir);
        exit(1);
      };
    }

    # Can't create archive object
    else {
      $log->error('Unable to extract from primary archive ' . $input[0]);
      exit(1);
    };
  };

  # TODO: Support sigles

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed by finish() is the last parameter:
      # [message, temporary directory (optional)]
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to initialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      my $filename = catfile(
        $output,
        get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($dirs[$i] => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $dirs[$i]]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    for (my $i = 0; $i < $count; $i++) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_text($dirs[$i], $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if (my $return = $batch_file->process($dir => $filename)) {
          # Delete temporary file
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          # Delete temporary file
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
      };
    };
  }

  # Nothing to process: stop here with an error status,
  # as no start time was taken that could be reported below
  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Delete cache file
    unlink($cache_file) if $cache_delete;

    exit(1);
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command - print usage and exit with an error status
else {
  pod2usage(%ERROR_HASH);
};
Akron941c1a62016-02-23 17:41:41 +0100822
Nils Diewald2db9ad02013-10-29 19:26:43 +0000823
824__END__
Akron941c1a62016-02-23 17:41:41 +0100825
826=pod
827
828=encoding utf8
829
830=head1 NAME
831
Akronf7ad89e2016-03-16 18:22:47 +0100832korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100833
834
835=head1 SYNOPSIS
836
Akrona76d8352016-10-27 16:27:32 +0200837 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100838
Akron2fd402b2016-10-27 21:26:48 +0200839
Akron941c1a62016-02-23 17:41:41 +0100840=head1 DESCRIPTION
841
842L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
843compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100844The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100845
846
847=head1 INSTALLATION
848
849The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
850
Akronaf386982016-10-12 00:33:25 +0200851 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100852
Akronc13a1702016-03-15 19:33:14 +0100853In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100854be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200855Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100857
858=head1 ARGUMENTS
859
Akrona76d8352016-10-27 16:27:32 +0200860 $ korapxml2krill -z --input <directory> --output <filename>
861
862Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200863It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200864
Akron941c1a62016-02-23 17:41:41 +0100865=over 2
866
867=item B<archive>
868
Akrona76d8352016-10-27 16:27:32 +0200869 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
870
Akron2fd402b2016-10-27 21:26:48 +0200871Converts an archive of KorAP-XML documents. It expects a directory
872(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100873
874=item B<extract>
875
Akrona76d8352016-10-27 16:27:32 +0200876 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
877
878Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100879
880=back
881
882
883=head1 OPTIONS
884
885=over 2
886
Akrona76d8352016-10-27 16:27:32 +0200887=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100888
Akrona76d8352016-10-27 16:27:32 +0200889Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100890
Akron7606afa2016-10-25 16:23:49 +0200891Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100892document, while C<archive> expects a KorAP-XML corpus folder or a zip
893file to batch process multiple files.
894C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200895
Akrona76d8352016-10-27 16:27:32 +0200896C<archive> supports multiple input zip files with the constraint
Akron2cfe8092016-06-24 17:48:49 +0200897that the first archive listed contains all primary data files
898and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200899
Akron7606afa2016-10-25 16:23:49 +0200900 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200901
Akron821db3d2017-04-06 21:19:31 +0200902Input may also be defined using BSD glob wildcards.
903
904 -i 'file/news*.zip'
905
906The extended input array will be sorted in length order, so the shortest
907path needs to contain all primary data files and all meta data files.
908
Akron0c3e3752016-06-28 15:55:53 +0200909(The directory structure follows the base directory format,
910that may include a C<.> root folder.
911In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200912need to be passed with a hash sign in front of the archive's name.
913This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200914
Akron7606afa2016-10-25 16:23:49 +0200915To support zip files, a version of C<unzip> needs to be installed that is
916compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200917
Akron7606afa2016-10-25 16:23:49 +0200918B<The root folder switch using the hash sign is experimental and
919may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200920
Akron941c1a62016-02-23 17:41:41 +0100921=item B<--output|-o> <directory|file>
922
923Output folder for archive processing or
924document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100925writes to C<STDOUT> by default
926(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100927
928=item B<--overwrite|-w>
929
930Overwrite files that already exist.
931
Akron3741f8b2016-12-21 19:55:21 +0100932=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100933
934Define the default tokenization by specifying
935the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100936of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100937
Akron3741f8b2016-12-21 19:55:21 +0100938
939=item B<--base-sentences|-bs> <foundry>#<layer>
940
941Define the layer for base sentences.
942If given, this will be used instead of using C<Base#Sentences>.
943Currently C<DeReKo#Structure> is the only additional layer supported.
944
945 Defaults to unset.
946
947
948=item B<--base-paragraphs|-bp> <foundry>#<layer>
949
950Define the layer for base paragraphs.
951If given, this will be used instead of using C<Base#Paragraphs>.
952Currently C<DeReKo#Structure> is the only additional layer supported.
953
954 Defaults to unset.
955
956
Akron41ac10b2017-02-08 22:47:25 +0100957=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
958
959Define the layer for base pagebreaks.
960Currently C<DeReKo#Structure> is the only layer supported.
961
962 Defaults to unset.
963
964
Akron941c1a62016-02-23 17:41:41 +0100965=item B<--skip|-s> <foundry>[#<layer>]
966
Akronf7ad89e2016-03-16 18:22:47 +0100967Skip specific annotations by specifying the foundry
968(and optionally the layer with a C<#>-prefix),
969e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100970Can be set multiple times.
971
Akronc13a1702016-03-15 19:33:14 +0100972=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100973
Akronf7ad89e2016-03-16 18:22:47 +0100974Convert specific annotations by specifying the foundry
975(and optionally the layer with a C<#>-prefix),
976e.g. C<Mate> or C<Mate#Morpho>.
977Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100978
979=item B<--primary|-p>
980
Akronc13a1702016-03-15 19:33:14 +0100981Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100982Can be flagged using C<--no-primary> as well.
983This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100984
985=item B<--jobs|-j>
986
987Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100988for archive processing.
Akron11c80302016-03-18 19:44:43 +0100989Defaults to C<0> (everything runs in a single process).
Akronc11f7982017-02-21 21:20:14 +0100990Pass -1, and the value will be set automatically to 5
991times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +0100992This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100993
Akron35db6e32016-03-17 22:42:22 +0100994=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100995
Akron35db6e32016-03-17 22:42:22 +0100996Define the metadata parser to use. Defaults to C<I5>.
997Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
998This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100999
1000=item B<--pretty|-y>
1001
Akronc13a1702016-03-15 19:33:14 +01001002Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001003This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001004
1005=item B<--gzip|-z>
1006
Akronf7ad89e2016-03-16 18:22:47 +01001007Compress the output.
1008Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001009
Akron11c80302016-03-18 19:44:43 +01001010=item B<--cache|-c>
1011
1012File to mmap a cache (using L<Cache::FastMmap>).
1013Defaults to C<korapxml2krill.cache> in the calling directory.
1014
1015=item B<--cache-size|-cs>
1016
1017Size of the cache. Defaults to C<50m>.
1018
1019=item B<--cache-init|-ci>
1020
1021Initialize cache file.
1022Can be flagged using C<--no-cache-init> as well.
1023Defaults to C<true>.
1024
1025=item B<--cache-delete|-cd>
1026
1027Delete cache file after processing.
1028Can be flagged using C<--no-cache-delete> as well.
1029Defaults to C<true>.
1030
Akron636aa112017-04-07 18:48:56 +02001031=item B<--config|-cfg>
1032
1033Configure the parameters of your call in a file
1034of key-value pairs separated by whitespace:
1035
1036 overwrite 1
1037 token DeReKo#Structure
1038 ...
1039
1040Supported parameters are:
1041C<overwrite>, C<gzip>, C<jobs>,
1042C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron81500102017-04-07 20:45:44 +02001043C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
Akron636aa112017-04-07 18:48:56 +02001044C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
1045(semicolon separated), C<anno> (semicolon separated).
1046
Akron81500102017-04-07 20:45:44 +02001047=item B<--temporary-extract|-te>
1048
1049Only valid for the C<archive> command.
1050
1051This will first extract all files into a
1052directory and then will archive.
1053If the directory is given as C<:temp:>,
1054a temporary directory is used.
1055This is especially useful to avoid
1056massive unzipping and potential
1057network latency.
Akron636aa112017-04-07 18:48:56 +02001058
Akrone10ad322016-02-27 10:54:26 +01001059=item B<--sigle|-sg>
1060
Akron20807582016-10-26 17:11:34 +02001061Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001062Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001063I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001064Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001065In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001066On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001067
Akron941c1a62016-02-23 17:41:41 +01001068=item B<--log|-l>
1069
1070The L<Log::Log4perl> log level, defaults to C<ERROR>.
1071
1072=item B<--help|-h>
1073
1074Print this document.
1075
1076=item B<--version|-v>
1077
1078Print version information.
1079
1080=back
1081
Akronc13a1702016-03-15 19:33:14 +01001082=head1 ANNOTATION SUPPORT
1083
1084L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1085developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1086The base foundry with paragraphs, sentences, and the text element is mandatory for
1087L<Krill|https://github.com/KorAP/Krill>.
1088
Akron821db3d2017-04-06 21:19:31 +02001089 Base
1090 #Paragraphs
1091 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001092
Akron821db3d2017-04-06 21:19:31 +02001093 Connexor
1094 #Morpho
1095 #Phrase
1096 #Sentences
1097 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001098
Akron821db3d2017-04-06 21:19:31 +02001099 CoreNLP
1100 #Constituency
1101 #Morpho
1102 #NamedEntities
1103 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001104
Akron821db3d2017-04-06 21:19:31 +02001105 DeReKo
1106 #Structure
Akronc13a1702016-03-15 19:33:14 +01001107
Akron821db3d2017-04-06 21:19:31 +02001108 DRuKoLa
1109 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001110
Akron821db3d2017-04-06 21:19:31 +02001111 Glemm
1112 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001113
Akron821db3d2017-04-06 21:19:31 +02001114 Malt
1115 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001116
Akron821db3d2017-04-06 21:19:31 +02001117 MarMoT
1118 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001119
Akron821db3d2017-04-06 21:19:31 +02001120 Mate
1121 #Dependency
1122 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001123
Akron821db3d2017-04-06 21:19:31 +02001124 MDParser
1125 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001126
Akron821db3d2017-04-06 21:19:31 +02001127 OpenNLP
1128 #Morpho
1129 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001130
Akron821db3d2017-04-06 21:19:31 +02001131 Sgbr
1132 #Lemma
1133 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001134
Akron821db3d2017-04-06 21:19:31 +02001135 TreeTagger
1136 #Morpho
1137 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001138
Akron821db3d2017-04-06 21:19:31 +02001139 XIP
1140 #Constituency
1141 #Morpho
1142 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001143
Akronc13a1702016-03-15 19:33:14 +01001144
1145More importers are in preparation.
1146New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1147See the built-in annotation importers as examples.
1148
Akron941c1a62016-02-23 17:41:41 +01001149=head1 AVAILABILITY
1150
1151 https://github.com/KorAP/KorAP-XML-Krill
1152
1153
1154=head1 COPYRIGHT AND LICENSE
1155
Akron3ec0a1c2017-01-18 14:41:55 +01001156Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001157
Akron941c1a62016-02-23 17:41:41 +01001158Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001159
Akrona76d8352016-10-27 16:27:32 +02001160Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001161
1162L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1163Corpus Analysis Platform at the
1164L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1165member of the
1166L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1167
1168This program is free software published under the
1169L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1170
1171=cut