blob: 1d011f25f27227ff8aada05fcee6f5f60cfa8de2 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron941c1a62016-02-23 17:41:41 +0100142# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100143
# Release metadata used by the --version and --help output
our $LAST_CHANGE   = '2019/03/08';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# One-line banner: library version, contact address and date of last change
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - "
  . "diewald\@ids-mannheim.de - $LAST_CHANGE\n";
150
Akron63f20d42017-04-10 23:40:29 +0200151# Prototypes
152sub get_file_name_from_glob($);
153sub get_file_name($);
154
# Parse command:
# The first argument is taken as the command name, unless it
# starts with a dash (i.e. looks like an option) or is missing.
our @ARGV;
my $cmd = ($ARGV[0] && index($ARGV[0], '-') != 0) ? shift(@ARGV) : undef;

# Remember the remaining arguments as they were given, before
# GetOptions consumes them (needed to re-invoke the script in serial mode)
my @keep_argv = @ARGV;

# Multi-value options
my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100165
# Parse options from the command line.
# All scalar options are declared in place, so they are visible
# in the rest of the script. If parsing fails (unknown option,
# missing or malformed value), the usage is shown and the script
# exits with a non-zero status instead of silently continuing.
GetOptions(
  'input|i=s'   => \@input,
  'input-base|ib=s' => \(my $input_base),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'meta|m=s'    => \(my $meta),
  'token|t=s'   => \(my $token_base),
  'base-sentences|bs=s'   => \(my $base_sentences),
  'base-paragraphs|bp=s'  => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
  'gzip|z'      => \(my $gzip),
  'temporary-extract|te=s' => \(my $extract_dir),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'cache|c=s'   => \(my $cache_file),
  'config|cfg=s' => \(my $cfg_file),
  'log|l=s'     => \(my $log_level),
  'anno|a=s'    => \@anno,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs),
  'koral|k=f'   => \(my $koral),
  'to-tar'      => \(my $to_tar),
  'non-word-tokens|nwt' => \(my $non_word_tokens),
  'sequential-extraction|se' => \(my $sequential_extraction),
  'cache-size|cs=s'  => \(my $cache_size),
  'cache-delete|cd!' => \(my $cache_delete),
  'cache-init|ci!'   => \(my $cache_init),

  # Show the full usage
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Show the version banner only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
211
Akron63f20d42017-04-10 23:40:29 +0200212
# Load from configuration
# Values from the configuration file only apply to options
# that were not given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value options: configuration key => variable to fill
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],             # Koral version
    [ 'input-base'            => \$input_base ],        # Input root base directory
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],        # Token base
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'cache'                 => \$cache_file ],        # Cache file
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],            # Write to tar
    [ 'log'                   => \$log_level ],
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # The command line wins over the configuration
    next if defined $$target;
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # Multi-value options: given as one semicolon separated string
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ],
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # Only fill lists that are still empty
    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
334
Akron63f20d42017-04-10 23:40:29 +0200335
# Set defaults for all options that are still undefined after
# reading the command line and the configuration file
foreach my $default (
  [ \$token_base            => 'OpenNLP#tokens' ],
  [ \$cache_file            => 'korapxml2krill.cache' ],
  [ \$cache_size            => '50m' ],
  [ \$jobs                  => 0 ],
  [ \$koral                 => $KORAL_VERSION ],
  [ \$cache_delete          => 1 ],
  [ \$cache_init            => 1 ],
  [ \$sequential_extraction => 0 ],
  [ \$log_level             => 'ERROR' ],
  [ \$non_word_tokens       => 0 ],
) {
  my ($target, $value) = @$default;
  $$target //= $value;
};

# Base layers default to the empty string and are lowercased,
# so they can be compared against lowercase identifiers
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');
Akron3741f8b2016-12-21 19:55:21 +0100354
Akron63f20d42017-04-10 23:40:29 +0200355
# Initialize log4perl object
# All messages of the chosen level (or above) are written to STDERR.
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


# Report the configuration file only if it exists, as a missing
# file is skipped when the configuration is loaded.
if ($cfg_file) {
  if (-e $cfg_file) {
    print "Reading config from $cfg_file\n";
  }
  else {
    $log->warn("Configuration file '$cfg_file' does not exist and is ignored");
  };
};
368
369
# Parameters for the usage message shown on invalid invocation
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means: derive the number of jobs
# from the number of available CPU cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
390
Akron821db3d2017-04-06 21:19:31 +0200391
# Start serial processing
# Every input is processed by a separate 'archive' run of this
# script, one after the other, each writing to its own output path
# below $output. The script exits afterwards - with a non-zero
# status, if at least one of the runs failed.
if ($cmd && $cmd eq 'serial') {

  # Without tarring, the output directory has to exist beforehand
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output options from the original arguments,
  # as they are set individually for each run. Getopt::Long accepts
  # the value either as the next argument ('-i x', '--input x')
  # or attached ('--input=x'), with one or two leading dashes.
  my $io_flag = qr/\A--?(?:i|input|o|output)/;
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Flag - the value follows as the next argument
    if ($arg =~ /${io_flag}\z/) {
      $remove_next = 1;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Flag with an attached value
    elsif ($arg =~ /${io_flag}=/) {
      next;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;


  # Iterate over all inputs
  my $failed = 0;
  foreach my $single_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($single_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $single_input, '-o', $new_out
    );
    print "Start serial processing of $single_input to $new_out\n";

    # Start archiving and check the result
    system @archive_cmd;

    # The command could not be started at all
    if ($? == -1) {
      $log->error("Unable to start processing of $single_input: $!");
      $failed++;
    }

    # The command died from a signal or returned a non-zero exit code
    elsif ($? != 0) {
      $log->error(
        "Processing of $single_input failed with status " . ($? >> 8)
      );
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
446
# Lookup table of all foundries and layers to skip (lowercased)
my %skip = map { lc($_) => 1 } @skip;

# Collect the base structures that are taken from DeReKo#Structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotation layers as [Foundry, Layer, (Parameter)]
my @layers = (

  # Base - only if no other base for sentences or paragraphs is given
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - passes the requested base structures as a parameter
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # DGD
  ['DGD', 'Morpho'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],
);


# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly given annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# either by Foundry or by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
554
# Get tokenization basis
# The token base is given as 'Foundry#Layer'. The variables are
# declared unconditionally, as the behaviour of 'my ... if ...'
# is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (there is no layer, if the base lacks a '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, that converts single texts
# based on all options given
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  koral     => $koral,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
583
# Get file name based on path information
#
# Expects the path of a text and returns a flat name for it, with
# all slashes turned into dashes. The base path is taken from the
# first input (or from its parent path, if the input is a directory)
# and removed from the name.
sub get_file_name ($) {
  my $file = shift;

  # Base path to remove from the file path
  my $i = $input[0];
  if (-d $i) {
    # Strip the last path segment of the directory
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the base path (with an optional
  # leading slash). The path is quoted, so characters like '+', '('
  # or '[' in directory names are not treated as regex syntax.
  $file =~ s/\/?\Q$i\E//;

  # Flatten the path and remove leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the leading fragment, if at least four
  # dash separated fragments are left
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
600
Akron63f20d42017-04-10 23:40:29 +0200601
# Derive a flat name from an input pattern, usable as the name
# of an output directory or file. Path separators and brackets
# become dashes, wildcards are dropped and a trailing '.zip'
# is removed.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # Apply all rewrites in order to the same string
  for ($name) {
    s![\\\/]!-!g;       # Transform paths
    s/[\*\?]//g;        # Remove arbitrary fills
    s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
    s/\-\-+/-/g;        # Remove sequences of binding characters
    s/^-//;             # Clean beginning
    s/-$//;             # Clean end
    s/\.zip$//;         # Remove file extension
  };

  return $name;
};
613
614
# Convert sigle to path construct
# (e.g. the form 'A_B.C' becomes 'A/B/C', surrounding whitespace is removed)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands writing to an output directory (i.e. not to a tar file)
# require the directory to exist
if ($cmd && $output && (!defined($to_tar))) {
  unless (-e $output && -d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};
624
Akron63f20d42017-04-10 23:40:29 +0200625
# Glob and prefix files
# All inputs may contain wildcards and are expanded to the list of
# matching paths. If nothing matches at all, the script stops with
# an error, as all following steps rely on a first input.
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@new_input, bsd_glob($pattern));
  };

  # Wildcards without any match expand to nothing
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
645
646
# Process a single file
# Without a command, the first input is converted directly.
unless ($cmd) {

  # Remember the start time when the script is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and since the start of the script
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $since_start     = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last_stop . ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Take the first input and make sure the path ends with a slash
  my $input = $input[0];
  $input =~ s!([^/])$!$1/!;

  # Create, parse and convert the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
678
Nils Diewald59094f22014-11-05 18:20:50 +0000679
Akrone10ad322016-02-27 10:54:26 +0100680# Extract XML files
Akron81500102017-04-07 20:45:44 +0200681if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100682
Akrond5643ad2017-07-04 20:27:13 +0200683 # Output is required
684 pod2usage(%ERROR_HASH) unless $output;
685
Akron7d4cdd82016-08-17 21:39:45 +0200686 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200687 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100688
Akron7d4cdd82016-08-17 21:39:45 +0200689 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100690 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200691 $log->error("Unzip is not installed or incompatible.");
692 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100693 };
694
Akronb0c88db2016-06-29 16:33:18 +0200695 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200696 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200697
Akron31a08cb2019-02-20 20:43:26 +0100698 # Will set @sigle
699 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200700
Akrone10ad322016-02-27 10:54:26 +0100701 # Iterate over all given sigles and extract
702 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100703
Akron2812ba22016-10-28 21:55:59 +0200704 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200705
Akron03b24db2016-08-16 20:54:32 +0200706 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200707 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100708
Akron955b75b2019-02-21 14:28:41 +0100709 # TODO:
710 # - prefix???
711 $archive->extract_sigle([$_], $output, $jobs)
712 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200713 );
Akrone10ad322016-02-27 10:54:26 +0100714 print "extracted.\n";
715 };
Akronb0c88db2016-06-29 16:33:18 +0200716 }
Akron7d4cdd82016-08-17 21:39:45 +0200717
718 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200719 else {
720 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200721 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100722 };
723}
724
Akron81500102017-04-07 20:45:44 +0200725
Akron941c1a62016-02-23 17:41:41 +0100726# Process an archive
727elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000728
Akron81500102017-04-07 20:45:44 +0200729 my $archive_output;
730
731 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100732 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200733
734 # Create new archive object
735 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
736
737 # Check zip capabilities
738 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200739 $log->error("Unzip is not installed or incompatible.");
740 exit 1;
Akron81500102017-04-07 20:45:44 +0200741 };
742
743 # Add further annotation archived
744 $archive->attach($_) foreach @input[1..$#input];
745
746 # Create a temporary directory
747 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200748 $extract_dir = tempdir(CLEANUP => 0);
749 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200750 };
751
Akron63f20d42017-04-10 23:40:29 +0200752 # Add some random extra to avoid clashes with multiple archives
753 $extract_dir = catdir($extract_dir, random_string('cccccc'));
754
Akron31a08cb2019-02-20 20:43:26 +0100755 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200756 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200757 @input = ($extract_dir);
758 }
759 else {
760 $log->error('Unable to extract from primary archive ' . $input[0] .
761 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200762 exit 1;
Akron81500102017-04-07 20:45:44 +0200763 };
764 }
765
766 # Can't create archive object
767 else {
768 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200769 exit 1;
Akron81500102017-04-07 20:45:44 +0200770 };
771 };
772
Akron7d4cdd82016-08-17 21:39:45 +0200773 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100774 my $pool = Parallel::ForkManager->new($jobs);
775
Akron7d4cdd82016-08-17 21:39:45 +0200776 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100777 my $iter = 1; # Current text in process
778
Akronda3097e2017-04-23 19:53:57 +0200779 my $tar_archive;
780 my $output_dir = $output;
781 my $tar_fh;
782
783 # Initialize tar archive
784 if ($to_tar) {
785 $tar_archive = Archive::Tar::Builder->new(
786 ignore_errors => 1
787 );
788
789 # Set output name
790 my $tar_file = $output;
791 unless ($tar_file =~ /\.tar$/) {
792 $tar_file .= '.tar';
793 };
794
795 # Initiate the tar file
796 print "Writing to file $tar_file\n";
797 $tar_fh = IO::File->new($tar_file, 'w');
798 $tar_fh->binmode(1);
799
800 # Set handle
801 $tar_archive->set_handle($tar_fh);
802
803 # Output to temporary directory
804 $output_dir = File::Temp->newdir;
805 };
806
Akron941c1a62016-02-23 17:41:41 +0100807 # Report on fork message
808 $pool->run_on_finish (
809 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200810 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100811 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200812
Akron08385f62016-03-22 20:37:04 +0100813 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200814 ($iter++) . "/$count]" .
815 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200816 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200817
818 if (!$code && $to_tar && $data->[2]) {
819 my $filename = $data->[2];
820
821 # Lock filehandle
822 if (flock($tar_fh, LOCK_EX)) {
823
Akron9a062ce2017-07-04 19:12:05 +0200824 my $clean_file = fileparse($filename);
825
Akronda3097e2017-04-23 19:53:57 +0200826 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200827 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200828 unlink $filename;
829
830 # Unlock filehandle
831 flock($tar_fh, LOCK_UN);
832 }
833 else {
834 $log->warn("Unable to add $filename to archive");
835 };
836 };
837
Akron4c0cf312016-10-15 16:42:09 +0200838 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100839 }
840 );
841
842 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200843 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100844 print "Reading data ...\n";
845
Akron7d4cdd82016-08-17 21:39:45 +0200846 # unless (Cache::FastMmap->new(
847 # share_file => $cache_file,
848 # cache_size => $cache_size,
849 # init_file => $cache_init
850 # )) {
851 # print "Unable to intialize cache '$cache_file'\n\n";
852 # exit(1);
853 # };
Akron11c80302016-03-18 19:44:43 +0100854
Akron486f9ab2017-04-22 23:25:19 +0200855
Akron941c1a62016-02-23 17:41:41 +0100856 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100857 if (-d $input[0]) {
858 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100859 my @dirs;
860 my $dir;
861
Akron7d4cdd82016-08-17 21:39:45 +0200862 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100863 while (1) {
864 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200865 push @dirs, $dir;
866 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100867 };
868 last unless $it->next;
869 };
870
871 print "Start processing ...\n";
872 $t = Benchmark->new;
873 $count = scalar @dirs;
874
875 DIRECTORY_LOOP:
876 for (my $i = 0; $i < $count; $i++) {
877
Akrone1dbc382016-07-08 22:24:52 +0200878 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200879 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200880 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200881 );
Akron941c1a62016-02-23 17:41:41 +0100882
883 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200884 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200885
Akron13d56622016-10-31 14:54:49 +0100886 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200887 $pool->finish(
888 0,
Akronda3097e2017-04-23 19:53:57 +0200889 [
890 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
891 undef,
892 $filename
893 ]
Akron486f9ab2017-04-22 23:25:19 +0200894 );
Akron3ec48972016-08-17 23:24:52 +0200895 }
896 else {
Akron4c0cf312016-10-15 16:42:09 +0200897 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200898 };
Akron941c1a62016-02-23 17:41:41 +0100899 };
900 }
901
902 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200903 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200904
Akron941c1a62016-02-23 17:41:41 +0100905 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200906 $log->error("Unzip is not installed or incompatible.");
907 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100908 };
909
Akron08385f62016-03-22 20:37:04 +0100910 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200911 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100912
Akron31a08cb2019-02-20 20:43:26 +0100913 # Get sigles to extract
914 my $prefix = set_sigle($archive);
915
Akron941c1a62016-02-23 17:41:41 +0100916 print "Start processing ...\n";
917 $t = Benchmark->new;
918 my @dirs = $archive->list_texts;
919 $count = scalar @dirs;
920
921 ARCHIVE_LOOP:
922 for (my $i = 0; $i < $count; $i++) {
923
924 # Split path information
925 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
926
Akrone1dbc382016-07-08 22:24:52 +0200927 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200928 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200929 get_file_name(
930 catfile($corpus, $doc, $text)
931 . '.json' . ($gzip ? '.gz' : '')
932 )
Akrone1dbc382016-07-08 22:24:52 +0200933 );
Akron941c1a62016-02-23 17:41:41 +0100934
935 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200936 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100937
Akron4c0cf312016-10-15 16:42:09 +0200938 # Create temporary file
939 $temp = File::Temp->newdir;
940
Akronbdf434a2016-10-24 17:42:07 +0200941 # TODO: Check if $filename exist at the beginning,
942 # because extraction can be horrible slow!
943
Akron941c1a62016-02-23 17:41:41 +0100944 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100945 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100946
Akron7d4cdd82016-08-17 21:39:45 +0200947 # Create corpus directory
948 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100949
Akron7d4cdd82016-08-17 21:39:45 +0200950 # Temporary directory
951 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100952
Akron7d4cdd82016-08-17 21:39:45 +0200953 # Write file
Akron13d56622016-10-31 14:54:49 +0100954 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200955
Akron4c0cf312016-10-15 16:42:09 +0200956 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100957 $pool->finish(
958 0,
Akronda3097e2017-04-23 19:53:57 +0200959 [
960 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
961 $temp,
962 $filename
963 ]
Akron13d56622016-10-31 14:54:49 +0100964 );
965 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200966 }
967 else {
Akron4c0cf312016-10-15 16:42:09 +0200968 # Delete temporary file
969 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200970 };
Akron941c1a62016-02-23 17:41:41 +0100971 }
Akron7d4cdd82016-08-17 21:39:45 +0200972
973 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100974 else {
Akron4c0cf312016-10-15 16:42:09 +0200975 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100976 };
977 };
978 }
979
980 else {
981 print "Input is neither a directory nor an archive.\n\n";
982 };
983
984 $pool->wait_all_children;
985
Akron11c80302016-03-18 19:44:43 +0100986 # Delete cache file
987 unlink($cache_file) if $cache_delete;
988
Akronda3097e2017-04-23 19:53:57 +0200989 # Close tar filehandle
990 if ($to_tar && $tar_fh) {
991 $tar_archive->finish;
992 $tar_fh->close;
993 print "Wrote to tar archive.\n";
994 };
995
Akron63f20d42017-04-10 23:40:29 +0200996 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100997 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200998};
Akron941c1a62016-02-23 17:41:41 +0100999
Nils Diewald2db9ad02013-10-29 19:26:43 +00001000
# For an archive, this will create the list
# of all sigles to process.
#
# Works on the file-level @sigle list:
# - If @sigle is empty, it is filled with one "corpus/doc/text"
#   sigle for every text listed by the archive.
# - Otherwise every doc sigle ("corpus/doc") is extracted to
#   $output right away, and @sigle is reduced to the remaining
#   (text) sigles.
#
# Parameter: $archive - the archive object to query (the caller
#   passes a KorAP::XML::Archive); it has to provide list_texts,
#   split_path, check_prefix and extract_sigle.
#
# Returns the prefix: the first value of split_path for the last
# listed text (or 1, if the archive lists no texts) in case no
# sigles were given, otherwise the result of check_prefix.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    # (a named loop variable is used instead of $_, so the
    # archive methods can't clobber the current element)
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Check if a prefix is needed - this is only done once,
  # on the first sigle, no matter if it is a doc or a text sigle
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
  };

  # Text sigles are kept for later processing
  my @text_sigle;

  # Iterate over all sigle
  foreach my $current (@sigle) {

    # Sigle is a doc sigle
    if ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$current ...";
      $check_prefix_once->();
      print "\n";

      # Extract the whole document immediately.
      # With sequential extraction only one job is used.
      print '... ' . (
        $archive->extract_sigle([$current], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $current;
      $check_prefix_once->();
    };
  };

  # Only text sigles remain to be processed
  @sigle = @text_sigle;

  return $prefix;
};
1074
1075
1076
# Remove the temporary extraction directory (if one was used)
# and report the number of objects remove_tree returned
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};


print "\n";
1085
Nils Diewald2db9ad02013-10-29 19:26:43 +00001086__END__
Akron941c1a62016-02-23 17:41:41 +01001087
1088=pod
1089
1090=encoding utf8
1091
1092=head1 NAME
1093
Akronf7ad89e2016-03-16 18:22:47 +01001094korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001095
1096
1097=head1 SYNOPSIS
1098
Akrona76d8352016-10-27 16:27:32 +02001099 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001100
Akron2fd402b2016-10-27 21:26:48 +02001101
Akron941c1a62016-02-23 17:41:41 +01001102=head1 DESCRIPTION
1103
1104L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1105compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001106The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001107
1108
1109=head1 INSTALLATION
1110
1111The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1112
Akronaf386982016-10-12 00:33:25 +02001113 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001114
Akronc13a1702016-03-15 19:33:14 +01001115In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001116be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001117Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001119
1120=head1 ARGUMENTS
1121
Akrona76d8352016-10-27 16:27:32 +02001122 $ korapxml2krill -z --input <directory> --output <filename>
1123
1124Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001125It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001126
Akron941c1a62016-02-23 17:41:41 +01001127=over 2
1128
1129=item B<archive>
1130
Akron081639e2017-04-21 19:01:39 +02001131 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001132
Akron2fd402b2016-10-27 21:26:48 +02001133Converts an archive of KorAP-XML documents. It expects a directory
1134(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001135
1136=item B<extract>
1137
Akrona76d8352016-10-27 16:27:32 +02001138 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1139
1140Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001141
Akron63f20d42017-04-10 23:40:29 +02001142=item B<serial>
1143
1144 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1145
1146Convert archives sequentially. The inputs are not merged but treated
1147as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001149are created based on the archive name. In case the C<--to-tar> flag is given,
1150the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001151
1152
Akron941c1a62016-02-23 17:41:41 +01001153=back
1154
1155
1156=head1 OPTIONS
1157
1158=over 2
1159
Akrona76d8352016-10-27 16:27:32 +02001160=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001161
Akrona76d8352016-10-27 16:27:32 +02001162Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001163
Akron7606afa2016-10-25 16:23:49 +02001164Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001165document, while C<archive> expects a KorAP-XML corpus folder or a zip
1166file to batch process multiple files.
1167C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001168
Akrona76d8352016-10-27 16:27:32 +02001169C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001170that the first archive listed contains all primary data files
1171and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001172
Akron7606afa2016-10-25 16:23:49 +02001173 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001174
Akron821db3d2017-04-06 21:19:31 +02001175Input may also be defined using BSD glob wildcards.
1176
1177 -i 'file/news*.zip'
1178
1179The extended input array will be sorted in length order, so the shortest
1180path needs to contain all primary data files and all meta data files.
1181
Akron0c3e3752016-06-28 15:55:53 +02001182(The directory structure follows the base directory format,
1183that may include a C<.> root folder.
1184In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001185need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001187
Akron7606afa2016-10-25 16:23:49 +02001188To support zip files, a version of C<unzip> needs to be installed that is
1189compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001190
Akron7606afa2016-10-25 16:23:49 +02001191B<The root folder switch using the hash sign is experimental and
1192may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001193
Akronf73ffb62018-06-27 12:13:59 +02001194
Akron63f20d42017-04-10 23:40:29 +02001195=item B<--input-base|-ib> <directory>
1196
1197The base directory for inputs.
1198
1199
Akron941c1a62016-02-23 17:41:41 +01001200=item B<--output|-o> <directory|file>
1201
1202Output folder for archive processing or
1203document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001204writes to C<STDOUT> by default
1205(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001206
1207=item B<--overwrite|-w>
1208
1209Overwrite files that already exist.
1210
Akronf73ffb62018-06-27 12:13:59 +02001211
Akron3741f8b2016-12-21 19:55:21 +01001212=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001213
1214Define the default tokenization by specifying
1215the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001216of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001217
Akron3741f8b2016-12-21 19:55:21 +01001218
1219=item B<--base-sentences|-bs> <foundry>#<layer>
1220
1221Define the layer for base sentences.
1222If given, this will be used instead of using C<Base#Sentences>.
1223Currently C<DeReKo#Structure> is the only additional layer supported.
1224
1225 Defaults to unset.
1226
1227
1228=item B<--base-paragraphs|-bp> <foundry>#<layer>
1229
1230Define the layer for base paragraphs.
1231If given, this will be used instead of using C<Base#Paragraphs>.
1232Currently C<DeReKo#Structure> is the only additional layer supported.
1233
1234 Defaults to unset.
1235
1236
Akron41ac10b2017-02-08 22:47:25 +01001237=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1238
1239Define the layer for base pagebreaks.
1240Currently C<DeReKo#Structure> is the only layer supported.
1241
1242 Defaults to unset.
1243
1244
Akron941c1a62016-02-23 17:41:41 +01001245=item B<--skip|-s> <foundry>[#<layer>]
1246
Akronf7ad89e2016-03-16 18:22:47 +01001247Skip specific annotations by specifying the foundry
1248(and optionally the layer with a C<#>-prefix),
1249e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001250Can be set multiple times.
1251
Akronf73ffb62018-06-27 12:13:59 +02001252
Akronc13a1702016-03-15 19:33:14 +01001253=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001254
Akronf7ad89e2016-03-16 18:22:47 +01001255Convert specific annotations by specifying the foundry
1256(and optionally the layer with a C<#>-prefix),
1257e.g. C<Mate> or C<Mate#Morpho>.
1258Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001259
Akronf73ffb62018-06-27 12:13:59 +02001260
Akron941c1a62016-02-23 17:41:41 +01001261=item B<--primary|-p>
1262
Akronc13a1702016-03-15 19:33:14 +01001263Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001264Can be flagged using C<--no-primary> as well.
1265This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001266
Akronf73ffb62018-06-27 12:13:59 +02001267
Akroned9baf02019-01-22 17:03:25 +01001268=item B<--non-word-tokens|-nwt>
1269
1270Tokenize non-word tokens like word tokens (defined as matching
1271C</[\d\w]/>). Useful to treat punctuations as tokens.
1272
1273 Defaults to unset.
1274
Akron941c1a62016-02-23 17:41:41 +01001275=item B<--jobs|-j>
1276
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001278for archive processing.
Akron11c80302016-03-18 19:44:43 +01001279Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001280
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1283
Akronc11f7982017-02-21 21:20:14 +01001284Pass -1, and the value will be set automatically to 5
1285times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001286This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001287
Akronf73ffb62018-06-27 12:13:59 +02001288
Akron263274c2019-02-07 09:48:30 +01001289=item B<--koral|-k>
1290
1291Version of the output format. Supported versions are:
1292C<0> for legacy serialization, C<0.03> for serialization
1293with metadata fields as key-values on the root object,
1294C<0.4> for serialization with metadata fields as a list
1295of C<"@type":"koral:field"> objects.
1296
1297Currently defaults to C<0.03>.
1298
1299
Akron9ec88872017-04-12 16:29:06 +02001300=item B<--sequential-extraction|-se>
1301
Flag to indicate that extraction runs sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1303Some systems may have problems with extracting multiple archives
1304to the same folder at the same time.
1305Can be flagged using C<--no-sequential-extraction> as well.
1306Defaults to C<false>.
1307
Akronf73ffb62018-06-27 12:13:59 +02001308
Akron35db6e32016-03-17 22:42:22 +01001309=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001310
Akron35db6e32016-03-17 22:42:22 +01001311Define the metadata parser to use. Defaults to C<I5>.
1312Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1313This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001314
Akronf73ffb62018-06-27 12:13:59 +02001315
Akron941c1a62016-02-23 17:41:41 +01001316=item B<--pretty|-y>
1317
Akronc13a1702016-03-15 19:33:14 +01001318Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001319This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001320
Akronf73ffb62018-06-27 12:13:59 +02001321
Akron941c1a62016-02-23 17:41:41 +01001322=item B<--gzip|-z>
1323
Akronf7ad89e2016-03-16 18:22:47 +01001324Compress the output.
1325Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001326
Akronf73ffb62018-06-27 12:13:59 +02001327
Akron11c80302016-03-18 19:44:43 +01001328=item B<--cache|-c>
1329
1330File to mmap a cache (using L<Cache::FastMmap>).
1331Defaults to C<korapxml2krill.cache> in the calling directory.
1332
Akronf73ffb62018-06-27 12:13:59 +02001333
Akron11c80302016-03-18 19:44:43 +01001334=item B<--cache-size|-cs>
1335
1336Size of the cache. Defaults to C<50m>.
1337
Akronf73ffb62018-06-27 12:13:59 +02001338
Akron11c80302016-03-18 19:44:43 +01001339=item B<--cache-init|-ci>
1340
1341Initialize cache file.
1342Can be flagged using C<--no-cache-init> as well.
1343Defaults to C<true>.
1344
Akronf73ffb62018-06-27 12:13:59 +02001345
Akron11c80302016-03-18 19:44:43 +01001346=item B<--cache-delete|-cd>
1347
1348Delete cache file after processing.
1349Can be flagged using C<--no-cache-delete> as well.
1350Defaults to C<true>.
1351
Akronf73ffb62018-06-27 12:13:59 +02001352
Akron636aa112017-04-07 18:48:56 +02001353=item B<--config|-cfg>
1354
1355Configure the parameters of your call in a file
1356of key-value pairs with whitespace separator
1357
1358 overwrite 1
1359 token DeReKo#Structure
1360 ...
1361
1362Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001363C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001364C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001365C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001367C<base-sentences>, C<base-paragraphs>,
1368C<base-pagebreaks>,
1369C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001370(semicolon separated), C<anno> (semicolon separated).
1371
Akronf73ffb62018-06-27 12:13:59 +02001372Configuration parameters will always be overwritten by
1373passed parameters.
1374
1375
Akron81500102017-04-07 20:45:44 +02001376=item B<--temporary-extract|-te>
1377
1378Only valid for the C<archive> command.
1379
1380This will first extract all files into a
1381directory and then will archive.
1382If the directory is given as C<:temp:>,
1383a temporary directory is used.
1384This is especially useful to avoid
1385massive unzipping and potential
1386network latency.
Akron636aa112017-04-07 18:48:56 +02001387
Akronf73ffb62018-06-27 12:13:59 +02001388
Akronc93a0802019-07-11 15:48:34 +02001389=item B<--to-tar>
1390
1391Only valid for the C<archive> command.
1392
1393Writes the output into a tar archive.
1394
1395
Akrone10ad322016-02-27 10:54:26 +01001396=item B<--sigle|-sg>
1397
Akron20807582016-10-26 17:11:34 +02001398Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001399Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001400I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001401Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001402In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001403On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001404
Akronf73ffb62018-06-27 12:13:59 +02001405
Akron941c1a62016-02-23 17:41:41 +01001406=item B<--log|-l>
1407
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1409
Akronf73ffb62018-06-27 12:13:59 +02001410
Akron941c1a62016-02-23 17:41:41 +01001411=item B<--help|-h>
1412
1413Print this document.
1414
Akronf73ffb62018-06-27 12:13:59 +02001415
Akron941c1a62016-02-23 17:41:41 +01001416=item B<--version|-v>
1417
1418Print version information.
1419
1420=back
1421
Akronf73ffb62018-06-27 12:13:59 +02001422
Akronc13a1702016-03-15 19:33:14 +01001423=head1 ANNOTATION SUPPORT
1424
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1426developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1427The base foundry with paragraphs, sentences, and the text element are mandatory for
1428L<Krill|https://github.com/KorAP/Krill>.
1429
Akron821db3d2017-04-06 21:19:31 +02001430 Base
1431 #Paragraphs
1432 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001433
Akron821db3d2017-04-06 21:19:31 +02001434 Connexor
1435 #Morpho
1436 #Phrase
1437 #Sentences
1438 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001439
Akron821db3d2017-04-06 21:19:31 +02001440 CoreNLP
1441 #Constituency
1442 #Morpho
1443 #NamedEntities
1444 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001445
Akronce125b62017-06-19 11:54:36 +02001446 CMC
1447 #Morpho
1448
Akron821db3d2017-04-06 21:19:31 +02001449 DeReKo
1450 #Structure
Akronc13a1702016-03-15 19:33:14 +01001451
Akron57510c12019-01-04 14:58:53 +01001452 DGD
1453 #Morpho
1454
Akron821db3d2017-04-06 21:19:31 +02001455 DRuKoLa
1456 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001457
Akron821db3d2017-04-06 21:19:31 +02001458 Glemm
1459 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001460
Akronea1aed52018-07-19 14:43:34 +02001461 HNC
1462 #Morpho
1463
Akron4c679192018-01-16 17:41:49 +01001464 LWC
1465 #Dependency
1466
Akron821db3d2017-04-06 21:19:31 +02001467 Malt
1468 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001469
Akron821db3d2017-04-06 21:19:31 +02001470 MarMoT
1471 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001472
Akron821db3d2017-04-06 21:19:31 +02001473 Mate
1474 #Dependency
1475 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001476
Akron821db3d2017-04-06 21:19:31 +02001477 MDParser
1478 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001479
Akron821db3d2017-04-06 21:19:31 +02001480 OpenNLP
1481 #Morpho
1482 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001483
Akron821db3d2017-04-06 21:19:31 +02001484 Sgbr
1485 #Lemma
1486 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001487
Akron821db3d2017-04-06 21:19:31 +02001488 TreeTagger
1489 #Morpho
1490 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001491
Akron821db3d2017-04-06 21:19:31 +02001492 XIP
1493 #Constituency
1494 #Morpho
1495 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001496
Akronc13a1702016-03-15 19:33:14 +01001497
1498More importers are in preparation.
1499New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1500See the built-in annotation importers as examples.
1501
Akronf73ffb62018-06-27 12:13:59 +02001502
Akron941c1a62016-02-23 17:41:41 +01001503=head1 AVAILABILITY
1504
1505 https://github.com/KorAP/KorAP-XML-Krill
1506
1507
1508=head1 COPYRIGHT AND LICENSE
1509
Akroned9baf02019-01-22 17:03:25 +01001510Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001511
Akron941c1a62016-02-23 17:41:41 +01001512Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001513
Akrona76d8352016-10-27 16:27:32 +02001514Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001515
1516L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1517Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001518L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001519member of the
1520L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1521
1522This program is free software published under the
1523L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1524
1525=cut