blob: a439fff185d2e852ede95f19d3ea67a579b9df34 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akronc11f7982017-02-21 21:20:14 +010025
26# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010027# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010028# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010029
Akronc11f7982017-02-21 21:20:14 +010030# TODO: Use KorAP::XML::ForkPool!
31
Akron941c1a62016-02-23 17:41:41 +010032# CHANGES:
33# ----------------------------------------------------------
34# 2013/11/25
35# - Initial release
36#
37# 2014/10/29
38# - Merges foundry data to create indexer friendly documents
39#
Akron93d620e2016-02-05 19:40:05 +010040# 2016/02/04
41# - renamed to korapxml2krill
42# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010043#
44# 2016/02/12
45# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010046# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010047#
48# 2016/02/14
49# - Added version information
Akron941c1a62016-02-23 17:41:41 +010050# - Added support for archive files
51#
52# 2016/02/15
53# - Fixed temporary directory bug
54# - Improved skipping before unzipping
55# - Added EXPERIMENTAL concurrency support
56#
57# 2016/02/23
58# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010059#
60# 2016/02/27
61# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010062#
63# 2016/03/17
64# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010065#
66# 2016/03/18
67# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020068#
Akronf3f0c942016-06-27 13:27:14 +020069# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020070# - Added multi archive support
71# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020072# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020073#
74# 2016/07/06
75# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020076#
77# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020078# - Fixed temporary path issue in script
79#
80# 2016/10/24
81# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020082#
Akronb4bbec72016-10-26 20:21:02 +020083# 2016/10/24
84# - Added support for document extraction
85#
Akron3741f8b2016-12-21 19:55:21 +010086# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020087# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020088#
Akron3741f8b2016-12-21 19:55:21 +010089# 2016/12/21
90# - added support for base-sentences and base-tokenizations
91#
Akron4fa37c32017-01-20 14:43:10 +010092# 2017/01/20
93# - added support for DRuKoLa annotations
94#
Akron41ac10b2017-02-08 22:47:25 +010095# 2017/02/08
96# - added support for pagebreak annotations
97#
Akron821db3d2017-04-06 21:19:31 +020098# 2017/04/06
99# - added support for wildcards in input
100#
Akron636aa112017-04-07 18:48:56 +0200101# 2017/04/07
102# - support configuration option
103#
Akron941c1a62016-02-23 17:41:41 +0100104# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100105
Akron636aa112017-04-07 18:48:56 +0200106our $LAST_CHANGE = '2017/04/07';
Akron941c1a62016-02-23 17:41:41 +0100107our $LOCAL = $FindBin::Bin;
108our $VERSION_MSG = <<"VERSION";
109Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
110VERSION
111
# Parse command: a leading argument that does not start with
# a dash is not an option, but the name of the sub command
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron93d620e2016-02-05 19:40:05 +0100118
Akron5f51d422016-08-16 16:26:43 +0200119my (@skip, @sigle, @anno, @input);
Akron35db6e32016-03-17 22:42:22 +0100120my $text;
Akrone10ad322016-02-27 10:54:26 +0100121
Akron941c1a62016-02-23 17:41:41 +0100122# Parse options from the command line
Nils Diewald7364d1f2013-11-05 19:26:35 +0000123GetOptions(
Akron08385f62016-03-22 20:37:04 +0100124 'input|i=s' => \@input,
Akron941c1a62016-02-23 17:41:41 +0100125 'output|o=s' => \(my $output),
126 'overwrite|w' => \(my $overwrite),
Akron35db6e32016-03-17 22:42:22 +0100127 'meta|m=s' => \(my $meta),
Akron636aa112017-04-07 18:48:56 +0200128 'token|t=s' => \(my $token_base),
129 'base-sentences|bs=s' => \(my $base_sentences),
130 'base-paragraphs|bp=s' => \(my $base_paragraphs),
131 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
Akron941c1a62016-02-23 17:41:41 +0100132 'gzip|z' => \(my $gzip),
Akrone10ad322016-02-27 10:54:26 +0100133 'skip|s=s' => \@skip,
134 'sigle|sg=s' => \@sigle,
Akron636aa112017-04-07 18:48:56 +0200135 'cache|c=s' => \(my $cache_file),
136 'config|cfg=s' => \(my $cfg_file),
137 'log|l=s' => \(my $log_level),
Akron5f51d422016-08-16 16:26:43 +0200138 'anno|a=s' => \@anno,
Akron941c1a62016-02-23 17:41:41 +0100139 'primary|p!' => \(my $primary),
140 'pretty|y' => \(my $pretty),
Akron636aa112017-04-07 18:48:56 +0200141 'jobs|j=i' => \(my $jobs),
142 'cache-size|cs=s' => \(my $cache_size),
143 'cache-delete|cd!' => \(my $cache_delete),
144 'cache-init|ci!' => \(my $cache_init),
Akron941c1a62016-02-23 17:41:41 +0100145 'help|h' => sub {
146 pod2usage(
147 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200148 -verbose => 99,
149 -msg => $VERSION_MSG,
150 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100151 );
152 },
153 'version|v' => sub {
154 pod2usage(
Akron7d4cdd82016-08-17 21:39:45 +0200155 -verbose => 0,
156 -msg => $VERSION_MSG,
157 -output => '-'
Akron941c1a62016-02-23 17:41:41 +0100158 )
159 }
Nils Diewald7364d1f2013-11-05 19:26:35 +0000160);
161
# Load from configuration
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;
  Config::Simple->import_from($cfg_file, \%config);

  # Options with a single value: the configuration key and
  # the variable that was filled from the command line
  my @single_opts = (
    ['overwrite'       => \$overwrite],
    ['gzip'            => \$gzip],
    ['jobs'            => \$jobs],
    ['token'           => \$token_base],
    ['cache'           => \$cache_file],
    ['cache-size'      => \$cache_size],
    ['cache-delete'    => \$cache_delete],
    ['cache-init'      => \$cache_init],
    ['meta'            => \$meta],
    ['output'          => \$output],
    ['base-sentences'  => \$base_sentences],
    ['base-paragraphs' => \$base_paragraphs],
    ['base-pagebreaks' => \$base_pagebreaks],
    ['log'             => \$log_level]
  );

  # Values given on the command line take precedence
  # over values from the configuration file
  foreach my $opt (@single_opts) {
    my ($key, $value_ref) = @$opt;
    next if defined $$value_ref;
    next unless defined $config{$key};
    $$value_ref = $config{$key};
  };

  # Options with multiple values are separated
  # by semicolons in the configuration file
  my @multi_opts = (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  );

  foreach my $opt (@multi_opts) {
    my ($key, $list_ref) = @$opt;
    next if scalar(@$list_ref);
    next unless defined $config{$key};
    @$list_ref = split /\s*;\s*/, $config{$key};
  };
};
256
# Fall back to default values for all options that were
# neither set on the command line nor in the configuration
$token_base   //= 'OpenNLP#tokens';
$cache_file   //= 'korapxml2krill.cache';
$cache_size   //= '50m';
$jobs         //= 0;
$cache_delete //= 1;
$cache_init   //= 1;
$log_level    //= 'ERROR';

# Base annotations default to the empty string
# and are normalized to lower case
foreach ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $_ = lc($_ // '');
};
Akron3741f8b2016-12-21 19:55:21 +0100272
Akron941c1a62016-02-23 17:41:41 +0100273my %ERROR_HASH = (
274 -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
Akron7d4cdd82016-08-17 21:39:45 +0200275 -verbose => 99,
276 -msg => $VERSION_MSG,
277 -output => '-',
278 -exit => 1
Akron941c1a62016-02-23 17:41:41 +0100279);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000280
Akron941c1a62016-02-23 17:41:41 +0100281# Input has to be defined
Akron08385f62016-03-22 20:37:04 +0100282pod2usage(%ERROR_HASH) unless @input;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000283
Akrone1dbc382016-07-08 22:24:52 +0200284# Gzip has no effect, if no output is given
285pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000286
Akron941c1a62016-02-23 17:41:41 +0100287# Initialize log4perl object
Nils Diewald7364d1f2013-11-05 19:26:35 +0000288Log::Log4perl->init({
289 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
290 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
291 'log4perl.appender.STDERR.layout' => 'PatternLayout',
292 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
293});
294
295my $log = Log::Log4perl->get_logger('main');
296
Akronc11f7982017-02-21 21:20:14 +0100297
# A job count of -1 means: derive the number
# of jobs from the number of available cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
303
Akron821db3d2017-04-06 21:19:31 +0200304
# Lookup table of all foundries and layers to skip (lower cased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# Collect all supported annotation layers as [Foundry, Layer] pairs
my @layers;

# Base (only if no other base annotations are defined)
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# DeReKo - may provide the base annotations
my %dereko_base = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $dereko_base{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

push @layers, (
  @dereko_attr ?
    ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
    ['DeReKo', 'Structure']
);

# Glemm
push @layers, ['Glemm', 'Morpho'];

# Malt
push @layers, ['Malt', 'Dependency'];

# MDParser
push @layers, ['MDParser', 'Dependency'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];
381
Akron4fa37c32017-01-20 14:43:10 +0100382
# Check filters
my @filtered_anno;

# Everything is skipped - only the annotations
# requested explicitly (as Foundry#Layer) are added
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
401
# Get tokenization basis (in the form of Foundry#Layer).
# The variables are declared unconditionally, as the behaviour
# of a "my" declaration with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
404
405# TODO: This should not be initialized for batch
406my $cache = Cache::FastMmap->new(
407 share_file => $cache_file,
408 cache_size => $cache_size,
Akron03b24db2016-08-16 20:54:32 +0200409 init_file => $cache_init
Akrone1dbc382016-07-08 22:24:52 +0200410);
411
Akron03b24db2016-08-16 20:54:32 +0200412# Create batch object
Akrone1dbc382016-07-08 22:24:52 +0200413my $batch_file = KorAP::XML::Batch::File->new(
Akron03b24db2016-08-16 20:54:32 +0200414 cache => $cache,
Akrone1dbc382016-07-08 22:24:52 +0200415 meta_type => $meta,
416 overwrite => $overwrite,
Akron03b24db2016-08-16 20:54:32 +0200417 foundry => $token_base_foundry,
418 layer => $token_base_layer,
419 gzip => $gzip,
420 log => $log,
421 primary => $primary,
422 pretty => $pretty,
423 anno => \@filtered_anno
Akrone1dbc382016-07-08 22:24:52 +0200424);
425
Akron941c1a62016-02-23 17:41:41 +0100426
# Get file name based on path information
#
# Expects the path to a text and returns a flat name for it:
# A leading temporary directory fragment and the path of the
# first input are removed, all slashes are replaced by dashes and,
# in case the name consists of at least four dash separated
# segments, the first segment is removed.
sub get_file_name ($) {
  my $file = shift;

  # The first input is the base to strip from the path
  my $base = $input[0];

  # In case the base is a directory, the last path segment that is
  # not followed by a slash is not stripped from the path
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base (with an optional leading slash).
  # The base is quoted, so characters like '.', '+' or '(' in
  # the path are matched literally instead of being treated
  # as regular expression syntax.
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Remove the first of four or more segments
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
443
# Convert sigle to path construct
# (the form "A_B.C" is rewritten to "A/B/C")
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};
446
# Commands require the output (if given) to be an existing directory
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is an error, so signal the failure to the calling process
  exit(1);
};
453
454
# Glob files
if (@input) {

  # Expand the wildcards of all inputs
  my @new_input = map { bsd_glob($_) } @input;

  # Rewrite the input whenever the expansion changed it.
  # Comparing the number of elements only is not sufficient,
  # as a wildcard may match exactly one file.
  # In case nothing matches at all, the input is kept as is,
  # so the following checks can report the problem.
  if (@new_input && join("\0", @new_input) ne join("\0", @input)) {

    # The shortest path is expected to be the primary input
    @input = sort { length($a) <=> length($b) } @new_input;
    print 'Input rewritten to ' . join(', ', @input)."\n";
  };
};
469
470
Akron941c1a62016-02-23 17:41:41 +0100471# Process a single file
472unless ($cmd) {
Akron08385f62016-03-22 20:37:04 +0100473 my $input = $input[0];
Nils Diewald59094f22014-11-05 18:20:50 +0000474
Akron941c1a62016-02-23 17:41:41 +0100475 BEGIN {
476 $main::TIME = Benchmark->new;
477 $main::LAST_STOP = Benchmark->new;
478 };
479
sub stop_time {
  # Log the time elapsed since the last stop and the overall
  # time, and remember the current stop for the next call
  my $now     = Benchmark->new;
  my $lap     = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));
  $log->info("The code took: $lap (overall: $overall)");
  $main::LAST_STOP = $now;
};
489
490 # Create and parse new document
491 $input =~ s{([^/])$}{$1/};
Akron941c1a62016-02-23 17:41:41 +0100492
Akron7d4cdd82016-08-17 21:39:45 +0200493 # Process file
Akrone1dbc382016-07-08 22:24:52 +0200494 $batch_file->process($input, $output);
Akron941c1a62016-02-23 17:41:41 +0100495
Akron11c80302016-03-18 19:44:43 +0100496 # Delete cache file
497 unlink($cache_file) if $cache_delete;
498
Akron5f51d422016-08-16 16:26:43 +0200499 stop_time;
Nils Diewald7364d1f2013-11-05 19:26:35 +0000500}
Nils Diewald59094f22014-11-05 18:20:50 +0000501
Akrone10ad322016-02-27 10:54:26 +0100502# Extract XML files
503elsif ($cmd eq 'extract') {
504
Akron7d4cdd82016-08-17 21:39:45 +0200505 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200506 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100507
Akron7d4cdd82016-08-17 21:39:45 +0200508 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100509 unless ($archive->test_unzip) {
510 print "Unzip is not installed or incompatible.\n\n";
511 exit(1);
512 };
513
Akronb0c88db2016-06-29 16:33:18 +0200514 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200515 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200516
Akron651cb8d2016-08-16 21:44:49 +0200517 my $prefix = 1;
518
Akron03b24db2016-08-16 20:54:32 +0200519 # No sigles given
520 unless (@sigle) {
521
522 # Get files
523 foreach ($archive->list_texts) {
524
525 # Split path information
Akron651cb8d2016-08-16 21:44:49 +0200526 ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron03b24db2016-08-16 20:54:32 +0200527
528 # TODO: Make this OS independent
529 push @sigle, join '/', $corpus, $doc, $text;
530 };
Akron20807582016-10-26 17:11:34 +0200531 }
532
533 # Check sigle for doc sigles
534 else {
535 my @new_sigle;
536
537 my $prefix_check = 0;
538
539 # Iterate over all sigle
540 foreach (@sigle) {
541
542 # Sigle is a doc sigle
543 if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron20807582016-10-26 17:11:34 +0200544
Akron60a8caa2017-02-17 21:51:27 +0100545 print "$_ ...";
Akron20807582016-10-26 17:11:34 +0200546 # Check if a prefix is needed
547 unless ($prefix_check) {
Akron60a8caa2017-02-17 21:51:27 +0100548
549 if ($prefix = $archive->check_prefix) {
550 print " with prefix ...";
551 };
Akron20807582016-10-26 17:11:34 +0200552 $prefix_check = 1;
553 };
554
Akron60a8caa2017-02-17 21:51:27 +0100555 print "\n";
556
Akron20807582016-10-26 17:11:34 +0200557 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200558 my $path = ($prefix ? './' : '') . $_;
559
560 print '... ' . (
Akron20807582016-10-26 17:11:34 +0200561 $archive->extract_doc(
Akron2812ba22016-10-28 21:55:59 +0200562 $path, $output, $jobs
Akron20807582016-10-26 17:11:34 +0200563 ) ? '' : 'not '
564 );
565 print "extracted.\n";
566 }
Akron60a8caa2017-02-17 21:51:27 +0100567
568 # Sigle is a text sigle
Akron20807582016-10-26 17:11:34 +0200569 else {
570 push @new_sigle, $_;
Akron60a8caa2017-02-17 21:51:27 +0100571
572 unless ($prefix_check) {
573
574 if ($prefix = $archive->check_prefix) {
575 print " with prefix ...";
576 };
577 $prefix_check = 1;
578 };
Akron20807582016-10-26 17:11:34 +0200579 };
580 };
581 @sigle = @new_sigle;
Akron03b24db2016-08-16 20:54:32 +0200582 };
583
Akrone10ad322016-02-27 10:54:26 +0100584 # Iterate over all given sigles and extract
585 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100586
Akron2812ba22016-10-28 21:55:59 +0200587 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200588
Akron03b24db2016-08-16 20:54:32 +0200589 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200590 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100591
Akron20807582016-10-26 17:11:34 +0200592 $archive->extract_text(
Akron651cb8d2016-08-16 21:44:49 +0200593 ($prefix ? './' : '') . $_, $output
594 ) ? '' : 'not '
595 );
Akrone10ad322016-02-27 10:54:26 +0100596 print "extracted.\n";
597 };
598
599 print "\n";
600 exit(1);
Akronb0c88db2016-06-29 16:33:18 +0200601 }
Akron7d4cdd82016-08-17 21:39:45 +0200602
603 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200604 else {
605 $log->error('Unable to extract from primary archive ' . $input[0]);
Akrone10ad322016-02-27 10:54:26 +0100606 };
607}
608
Akron941c1a62016-02-23 17:41:41 +0100609# Process an archive
610elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000611
Akrone1dbc382016-07-08 22:24:52 +0200612 # TODO: Support sigles
Akron941c1a62016-02-23 17:41:41 +0100613
Akron7d4cdd82016-08-17 21:39:45 +0200614 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100615 my $pool = Parallel::ForkManager->new($jobs);
616
Akron7d4cdd82016-08-17 21:39:45 +0200617 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100618 my $iter = 1; # Current text in process
619
620 # Report on fork message
621 $pool->run_on_finish (
622 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200623 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100624 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200625
Akron08385f62016-03-22 20:37:04 +0100626 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200627 ($iter++) . "/$count]" .
628 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200629 ' ' . $data->[0] . "\n";
630 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100631 }
632 );
633
634 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200635 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100636 print "Reading data ...\n";
637
Akron7d4cdd82016-08-17 21:39:45 +0200638 # unless (Cache::FastMmap->new(
639 # share_file => $cache_file,
640 # cache_size => $cache_size,
641 # init_file => $cache_init
642 # )) {
643 # print "Unable to intialize cache '$cache_file'\n\n";
644 # exit(1);
645 # };
Akron11c80302016-03-18 19:44:43 +0100646
Akron941c1a62016-02-23 17:41:41 +0100647 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100648 if (-d $input[0]) {
649 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100650 my @dirs;
651 my $dir;
652
Akron7d4cdd82016-08-17 21:39:45 +0200653 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100654 while (1) {
655 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200656 push @dirs, $dir;
657 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100658 };
659 last unless $it->next;
660 };
661
662 print "Start processing ...\n";
663 $t = Benchmark->new;
664 $count = scalar @dirs;
665
666 DIRECTORY_LOOP:
667 for (my $i = 0; $i < $count; $i++) {
668
Akrone1dbc382016-07-08 22:24:52 +0200669 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200670 $output,
671 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200672 );
Akron941c1a62016-02-23 17:41:41 +0100673
674 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200675 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200676
Akron13d56622016-10-31 14:54:49 +0100677 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
678 $pool->finish(
679 0,
680 ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
681 );
Akron3ec48972016-08-17 23:24:52 +0200682 }
683 else {
Akron4c0cf312016-10-15 16:42:09 +0200684 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200685 };
Akron941c1a62016-02-23 17:41:41 +0100686 };
687 }
688
689 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200690 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200691
Akron941c1a62016-02-23 17:41:41 +0100692 unless ($archive->test_unzip) {
693 print "Unzip is not installed or incompatible.\n\n";
694 exit(1);
695 };
696
Akron08385f62016-03-22 20:37:04 +0100697 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200698 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100699
Akron941c1a62016-02-23 17:41:41 +0100700 print "Start processing ...\n";
701 $t = Benchmark->new;
702 my @dirs = $archive->list_texts;
703 $count = scalar @dirs;
704
705 ARCHIVE_LOOP:
706 for (my $i = 0; $i < $count; $i++) {
707
708 # Split path information
709 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
710
Akrone1dbc382016-07-08 22:24:52 +0200711 my $filename = catfile(
Akron7d4cdd82016-08-17 21:39:45 +0200712 $output,
713 get_file_name(
714 catfile($corpus, $doc, $text)
715 . '.json' . ($gzip ? '.gz' : '')
716 )
Akrone1dbc382016-07-08 22:24:52 +0200717 );
Akron941c1a62016-02-23 17:41:41 +0100718
719 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200720 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100721
Akron4c0cf312016-10-15 16:42:09 +0200722 # Create temporary file
723 $temp = File::Temp->newdir;
724
      # TODO: Check if $filename exists at the beginning,
      # because extraction can be horribly slow!
727
Akron941c1a62016-02-23 17:41:41 +0100728 # Extract from archive
Akron20807582016-10-26 17:11:34 +0200729 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +0100730
Akron7d4cdd82016-08-17 21:39:45 +0200731 # Create corpus directory
732 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100733
Akron7d4cdd82016-08-17 21:39:45 +0200734 # Temporary directory
735 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100736
Akron7d4cdd82016-08-17 21:39:45 +0200737 # Write file
Akron13d56622016-10-31 14:54:49 +0100738 if (my $return = $batch_file->process($dir => $filename)) {
Akron4c0cf312016-10-15 16:42:09 +0200739 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100740 $pool->finish(
741 0,
742 ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
743 );
744 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200745 }
746 else {
Akron4c0cf312016-10-15 16:42:09 +0200747 # Delete temporary file
748 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200749 };
Akron941c1a62016-02-23 17:41:41 +0100750 }
Akron7d4cdd82016-08-17 21:39:45 +0200751
752 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100753 else {
Akron4c0cf312016-10-15 16:42:09 +0200754 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100755 };
756 };
757 }
758
759 else {
760 print "Input is neither a directory nor an archive.\n\n";
761 };
762
763 $pool->wait_all_children;
764
Akron11c80302016-03-18 19:44:43 +0100765 # Delete cache file
766 unlink($cache_file) if $cache_delete;
767
Akron941c1a62016-02-23 17:41:41 +0100768 print "Done.\n";
769 print timestr(timediff(Benchmark->new, $t))."\n\n";
770}
771
772# Unknown command
773else {
774 warn "Unknown command '$cmd'.\n\n";
775 pod2usage(%ERROR_HASH);
776}
Nils Diewald2db9ad02013-10-29 19:26:43 +0000777
778__END__
Akron941c1a62016-02-23 17:41:41 +0100779
780=pod
781
782=encoding utf8
783
784=head1 NAME
785
Akronf7ad89e2016-03-16 18:22:47 +0100786korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100787
788
789=head1 SYNOPSIS
790
Akrona76d8352016-10-27 16:27:32 +0200791 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100792
Akron2fd402b2016-10-27 21:26:48 +0200793
Akron941c1a62016-02-23 17:41:41 +0100794=head1 DESCRIPTION
795
796L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
797compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +0100798The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +0100799
800
801=head1 INSTALLATION
802
803The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
804
Akronaf386982016-10-12 00:33:25 +0200805 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +0100806
Akronc13a1702016-03-15 19:33:14 +0100807In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +0100808be available on your command line immediately.
Akron74381512016-10-14 11:56:22 +0200809Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +0100811
812=head1 ARGUMENTS
813
Akrona76d8352016-10-27 16:27:32 +0200814 $ korapxml2krill -z --input <directory> --output <filename>
815
816Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +0200817It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +0200818
Akron941c1a62016-02-23 17:41:41 +0100819=over 2
820
821=item B<archive>
822
Akrona76d8352016-10-27 16:27:32 +0200823 $ korapxml2krill archive -z --input <directory|archive> --output <directory>
824
Akron2fd402b2016-10-27 21:26:48 +0200825Converts an archive of KorAP-XML documents. It expects a directory
826(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +0100827
828=item B<extract>
829
Akrona76d8352016-10-27 16:27:32 +0200830 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
831
832Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +0100833
834=back
835
836
837=head1 OPTIONS
838
839=over 2
840
Akrona76d8352016-10-27 16:27:32 +0200841=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +0100842
Akrona76d8352016-10-27 16:27:32 +0200843Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +0100844
Akron7606afa2016-10-25 16:23:49 +0200845Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +0100846document, while C<archive> expects a KorAP-XML corpus folder or a zip
847file to batch process multiple files.
848C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +0200849
Akrona76d8352016-10-27 16:27:32 +0200850C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +0200851that the first archive listed contains all primary data files
852and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +0200853
Akron7606afa2016-10-25 16:23:49 +0200854 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +0200855
Akron821db3d2017-04-06 21:19:31 +0200856Input may also be defined using BSD glob wildcards.
857
858 -i 'file/news*.zip'
859
860The extended input array will be sorted in length order, so the shortest
861path needs to contain all primary data files and all meta data files.
862
Akron0c3e3752016-06-28 15:55:53 +0200863(The directory structure follows the base directory format,
864that may include a C<.> root folder.
865In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +0200866need to be passed with a hash sign in front of the archive's name.
867This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +0200868
Akron7606afa2016-10-25 16:23:49 +0200869To support zip files, a version of C<unzip> needs to be installed that is
870compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +0200871
Akron7606afa2016-10-25 16:23:49 +0200872B<The root folder switch using the hash sign is experimental and
873may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +0200874
Akron941c1a62016-02-23 17:41:41 +0100875=item B<--output|-o> <directory|file>
876
877Output folder for archive processing or
878document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +0100879writes to C<STDOUT> by default
880(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +0100881
882=item B<--overwrite|-w>
883
884Overwrite files that already exist.
885
Akron3741f8b2016-12-21 19:55:21 +0100886=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +0100887
888Define the default tokenization by specifying
889the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +0100890of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +0100891
Akron3741f8b2016-12-21 19:55:21 +0100892
893=item B<--base-sentences|-bs> <foundry>#<layer>
894
895Define the layer for base sentences.
896If given, this will be used instead of using C<Base#Sentences>.
897Currently C<DeReKo#Structure> is the only additional layer supported.
898
899 Defaults to unset.
900
901
902=item B<--base-paragraphs|-bp> <foundry>#<layer>
903
904Define the layer for base paragraphs.
905If given, this will be used instead of using C<Base#Paragraphs>.
906Currently C<DeReKo#Structure> is the only additional layer supported.
907
908 Defaults to unset.
909
910
Akron41ac10b2017-02-08 22:47:25 +0100911=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
912
913Define the layer for base pagebreaks.
914Currently C<DeReKo#Structure> is the only layer supported.
915
916 Defaults to unset.
917
918
Akron941c1a62016-02-23 17:41:41 +0100919=item B<--skip|-s> <foundry>[#<layer>]
920
Akronf7ad89e2016-03-16 18:22:47 +0100921Skip specific annotations by specifying the foundry
922(and optionally the layer with a C<#>-prefix),
923e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +0100924Can be set multiple times.
925
Akronc13a1702016-03-15 19:33:14 +0100926=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +0100927
Akronf7ad89e2016-03-16 18:22:47 +0100928Convert specific annotations by specifying the foundry
929(and optionally the layer with a C<#>-prefix),
930e.g. C<Mate> or C<Mate#Morpho>.
931Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +0100932
933=item B<--primary|-p>
934
Akronc13a1702016-03-15 19:33:14 +0100935Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +0100936Can be flagged using C<--no-primary> as well.
937This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100938
939=item B<--jobs|-j>
940
941Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +0100942for archive processing.
Akron11c80302016-03-18 19:44:43 +0100943Defaults to C<0> (everything runs in a single process).
Akronc11f7982017-02-21 21:20:14 +0100944Pass -1, and the value will be set automatically to 5
945times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +0100946This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100947
Akron35db6e32016-03-17 22:42:22 +0100948=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +0100949
Akron35db6e32016-03-17 22:42:22 +0100950Define the metadata parser to use. Defaults to C<I5>.
951Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
952This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +0100953
954=item B<--pretty|-y>
955
Akronc13a1702016-03-15 19:33:14 +0100956Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +0100957This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +0100958
959=item B<--gzip|-z>
960
Akronf7ad89e2016-03-16 18:22:47 +0100961Compress the output.
962Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +0100963
Akron11c80302016-03-18 19:44:43 +0100964=item B<--cache|-c>
965
966File to mmap a cache (using L<Cache::FastMmap>).
967Defaults to C<korapxml2krill.cache> in the calling directory.
968
969=item B<--cache-size|-cs>
970
971Size of the cache. Defaults to C<50m>.
972
973=item B<--cache-init|-ci>
974
975Initialize cache file.
976Can be flagged using C<--no-cache-init> as well.
977Defaults to C<true>.
978
979=item B<--cache-delete|-cd>
980
981Delete cache file after processing.
982Can be flagged using C<--no-cache-delete> as well.
983Defaults to C<true>.
984
Akron636aa112017-04-07 18:48:56 +0200985=item B<--config|-cfg>
986
987Configure the parameters of your call in a file
988of key-value pairs with whitespace separator:
989
990 overwrite 1
991 token DeReKo#Structure
992 ...
993
994Supported parameters are:
995C<overwrite>, C<gzip>, C<jobs>,
996C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
997C<output>, C<base-sentences>, C<base-paragraphs>,
998C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
999(semicolon separated), C<anno> (semicolon separated).
1000
1001
Akrone10ad322016-02-27 10:54:26 +01001002=item B<--sigle|-sg>
1003
Akron20807582016-10-26 17:11:34 +02001004Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001005Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001006I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001007Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001008In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001009On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001010
Akron941c1a62016-02-23 17:41:41 +01001011=item B<--log|-l>
1012
1013The L<Log::Log4perl> log level, defaults to C<ERROR>.
1014
1015=item B<--help|-h>
1016
1017Print this document.
1018
1019=item B<--version|-v>
1020
1021Print version information.
1022
1023=back
1024
Akronc13a1702016-03-15 19:33:14 +01001025=head1 ANNOTATION SUPPORT
1026
1027L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1028developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1029The base foundry with paragraphs, sentences, and the text element is mandatory for
1030L<Krill|https://github.com/KorAP/Krill>.
1031
Akron821db3d2017-04-06 21:19:31 +02001032 Base
1033 #Paragraphs
1034 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001035
Akron821db3d2017-04-06 21:19:31 +02001036 Connexor
1037 #Morpho
1038 #Phrase
1039 #Sentences
1040 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001041
Akron821db3d2017-04-06 21:19:31 +02001042 CoreNLP
1043 #Constituency
1044 #Morpho
1045 #NamedEntities
1046 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001047
Akron821db3d2017-04-06 21:19:31 +02001048 DeReKo
1049 #Structure
Akronc13a1702016-03-15 19:33:14 +01001050
Akron821db3d2017-04-06 21:19:31 +02001051 DRuKoLa
1052 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001053
Akron821db3d2017-04-06 21:19:31 +02001054 Glemm
1055 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001056
Akron821db3d2017-04-06 21:19:31 +02001057 Malt
1058 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001059
Akron821db3d2017-04-06 21:19:31 +02001060 MarMoT
1061 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001062
Akron821db3d2017-04-06 21:19:31 +02001063 Mate
1064 #Dependency
1065 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001066
Akron821db3d2017-04-06 21:19:31 +02001067 MDParser
1068 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001069
Akron821db3d2017-04-06 21:19:31 +02001070 OpenNLP
1071 #Morpho
1072 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001073
Akron821db3d2017-04-06 21:19:31 +02001074 Sgbr
1075 #Lemma
1076 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001077
Akron821db3d2017-04-06 21:19:31 +02001078 TreeTagger
1079 #Morpho
1080 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001081
Akron821db3d2017-04-06 21:19:31 +02001082 XIP
1083 #Constituency
1084 #Morpho
1085 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001086
Akronc13a1702016-03-15 19:33:14 +01001087
1088More importers are in preparation.
1089New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1090See the built-in annotation importers as examples.
1091
Akron941c1a62016-02-23 17:41:41 +01001092=head1 AVAILABILITY
1093
1094 https://github.com/KorAP/KorAP-XML-Krill
1095
1096
1097=head1 COPYRIGHT AND LICENSE
1098
Akron3ec0a1c2017-01-18 14:41:55 +01001099Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001100
Akron941c1a62016-02-23 17:41:41 +01001101Author: L<Nils Diewald|http://nils-diewald.de/>
Akrona76d8352016-10-27 16:27:32 +02001102Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001103
1104L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1105Corpus Analysis Platform at the
1106L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1107member of the
1108L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1109
1110This program is free software published under the
1111L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1112
1113=cut