blob: e8aded27f0e85aedf4cb862f378f33c20aba5482 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akron941c1a62016-02-23 17:41:41 +0100145# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100146
# Date of the last change listed in the CHANGES log
our $LAST_CHANGE = '2019/08/08';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Default Koral version (can be overridden with --koral)
our $KORAL_VERSION = 0.03;

# Message shown together with the usage and version information
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
153
# Forward declarations - both subs are defined further down
sub get_file_name($);
sub get_file_name_from_glob($);

# Parse command: a first argument that does not start with a dash
# is taken as the command and removed from the argument list
our @ARGV;
my $cmd = ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') ? shift(@ARGV) : undef;

# Remember the remaining arguments before the options are parsed
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100165
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options holding a single value (undefined unless passed)
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Show the usage sections of the documentation
  'help|h' => sub {
    pod2usage(
      -msg      => $VERSION_MSG,
      -output   => '-',
      -verbose  => 99,
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS'
    );
  },

  # Show the version message only
  'version|v' => sub {
    pod2usage(
      -msg     => $VERSION_MSG,
      -output  => '-',
      -verbose => 0
    );
  }
);
214
Akron63f20d42017-04-10 23:40:29 +0200215
# Load from configuration
#
# Values passed on the command line always take precedence:
# A configuration value is only used if the option is still
# undefined (single values) or empty (lists).
if ($cfg_file) {
  if (-e $cfg_file) {
    my %config;

    # Report an unreadable configuration instead of ignoring it silently
    Config::Simple->import_from($cfg_file, \%config)
      or warn "Unable to read configuration '$cfg_file': " .
        (Config::Simple->error // 'unknown error') . "\n";

    # Single value options: configuration key => option variable
    my @scalar_options = (
      'overwrite'             => \$overwrite,
      'gzip'                  => \$gzip,
      'jobs'                  => \$jobs,
      'koral'                 => \$koral,
      'input-base'            => \$input_base,
      'temporary-extract'     => \$extract_dir,
      'token'                 => \$token_base,
      'non-word-tokens'       => \$non_word_tokens,
      'cache'                 => \$cache_file,
      'cache-size'            => \$cache_size,
      'cache-delete'          => \$cache_delete,
      'cache-init'            => \$cache_init,
      'sequential-extraction' => \$sequential_extraction,
      'meta'                  => \$meta,
      'output'                => \$output,
      'base-sentences'        => \$base_sentences,
      'base-paragraphs'       => \$base_paragraphs,
      'base-pagebreaks'       => \$base_pagebreaks,
      'to-tar'                => \$to_tar,
      'log'                   => \$log_level
    );

    while (my ($key, $option) = splice(@scalar_options, 0, 2)) {
      next if defined($$option) || !defined($config{$key});
      $$option = $config{$key};
    };

    # List options are configured as semicolon separated values
    my @list_options = (
      'skip'  => \@skip,
      'sigle' => \@sigle,
      'anno'  => \@anno
    );

    while (my ($key, $option) = splice(@list_options, 0, 2)) {
      next if scalar(@$option) || !defined($config{$key});
      @$option = split /\s*;\s*/, $config{$key};
    };
  }

  # The configuration was requested but is not available
  else {
    warn "Configuration file '$cfg_file' does not exist\n";
  };
};
337
Akron63f20d42017-04-10 23:40:29 +0200338
# Set defaults for all options that are still undefined
foreach my $default (
  [\$token_base            => 'OpenNLP#tokens'],
  [\$cache_file            => 'korapxml2krill.cache'],
  [\$cache_size            => '50m'],
  [\$jobs                  => 0],
  [\$koral                 => $KORAL_VERSION],
  [\$cache_delete          => 1],
  [\$cache_init            => 1],
  [\$sequential_extraction => 0],
  [\$log_level             => 'ERROR'],
  [\$base_sentences        => ''],
  [\$base_paragraphs       => ''],
  [\$base_pagebreaks       => ''],
  [\$non_word_tokens       => 0]
) {
  my ($option, $value) = @$default;
  $$option //= $value;
};

# The base annotations are compared in lower case
$_ = lc($_) foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);


# Initialize log4perl object - everything is logged to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};


# Usage parameters for invalid calls (exit with error code)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means: five jobs per available core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
393
Akron821db3d2017-04-06 21:19:31 +0200394
# Start serial processing:
# Each input is converted by a separate call of this script
# in 'archive' mode, one call after the other.
if ($cmd && $cmd eq 'serial') {

  # Unless a tar is written, the output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output parameters (flag and value) from the
  # passed arguments, as they are set separately for each call
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the next argument is its value
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @passed_argv, $arg;
    };
  };
  @keep_argv = @passed_argv;

  # Number of archive calls that did not succeed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving - continue with the next input on failure,
    # but remember the failure for the exit code
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $in failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
449
# Lookup for everything that should be skipped (lower cased)
my %skip = map { (lc($_) => 1) } @skip;

# DeReKo structure attributes that are used as a base annotation
my @dereko_attr = ();
foreach my $base (
  [sentences  => $base_sentences],
  [paragraphs => $base_paragraphs],
  [pagebreaks => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# All supported annotations as [Foundry, Layer] pairs
my @layers;
{
  # Add one pair per layer of the given foundry
  my $add = sub {
    my ($foundry, @names) = @_;
    push @layers, [$foundry, $_] foreach @names;
  };

  # The base annotations are only added, if no other base is chosen
  $add->('Base', 'Sentences')  unless $base_sentences;
  $add->('Base', 'Paragraphs') unless $base_paragraphs;

  $add->('Connexor', qw/Morpho Syntax Phrase Sentences/);
  $add->('CoreNLP',  qw/NamedEntities Sentences Morpho Constituency/);
  $add->('CMC',      'Morpho');

  # DeReKo - the base attributes are passed as a third element
  push @layers, [
    'DeReKo', 'Structure',
    (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())
  ];

  $add->('DGD',        'Morpho');
  $add->('DRuKoLa',    'Morpho');
  $add->('Glemm',      'Morpho');
  $add->('HNC',        'Morpho');
  $add->('LWC',        'Dependency');
  $add->('Malt',       'Dependency');
  $add->('MarMoT',     'Morpho');
  $add->('Mate',       qw/Morpho Dependency/);
  $add->('MDParser',   'Dependency');
  $add->('OpenNLP',    qw/Morpho Sentences/);

  # Schreibgebrauch
  $add->('Sgbr',       qw/Lemma Morpho/);
  $add->('Talismane',  qw/Dependency Morpho/);
  $add->('TreeTagger', qw/Morpho Sentences/);
  $add->('XIP',        qw/Morpho Constituency Sentences Dependency/);
};


# Check filters
my @filtered_anno;

# Everything is skipped - only add the explicitly requested annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# either by Foundry or by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
561
# Get tokenization basis (in the form of Foundry#Layer).
# The variables are declared unconditionally, as the behaviour of
# a "my" with a statement modifier is undefined
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension - the layer is undefined,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200567
# Shared cache, backed by the cache file
# TODO: This should not be initialized for batch
my $cache = do {
  my @cache_param = (
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file  => $cache_init
  );
  Cache::FastMmap->new(@cache_param);
};

# Create batch object that converts single texts
my $batch_file = do {
  my @batch_param = (
    cache           => $cache,
    meta_type       => $meta,
    overwrite       => $overwrite,
    foundry         => $token_base_foundry,
    layer           => $token_base_layer,
    gzip            => $gzip,
    log             => $log,
    koral           => $koral,
    primary         => $primary,
    pretty          => $pretty,
    anno            => \@filtered_anno,
    non_word_tokens => $non_word_tokens
  );
  KorAP::XML::Batch::File->new(@batch_param);
};
590
# Get file name based on path information.
#
# Expects the path of a text and returns a flat name for it:
# Temp dir fragments and the path of the first input are removed,
# slashes are turned into dashes and - if at least three further
# dash separated segments follow - the first segment is dropped.
sub get_file_name ($) {
  my $file = shift;

  # The base is the first input; if this is a directory,
  # everything following the last slash is removed
  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path - it is quoted, so regex meta characters
  # in the path (like '.' or '+') are matched literally
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
607
Akron63f20d42017-04-10 23:40:29 +0200608
# Create a flat name based on an input path or glob pattern,
# e.g. to be used as the name of an output directory.
#
# Path separators and brackets are turned into dashes, wildcards
# and the '.zip' extension are removed.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;      # Transform paths
  $glob =~ s/[\*\?]//g;       # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;  # Remove class and multiple brackets

  # Remove file extension - this has to happen before the dashes are
  # cleaned, otherwise 'dir/*.zip' results in a trailing dash
  $glob =~ s/\.zip$//;

  $glob =~ s/\-\-+/-/g;       # Remove sequences of binding characters
  $glob =~ s/^-//;            # Clean beginning
  $glob =~ s/-$//;            # Clean end
  return $glob;
};
620
621
# Convert sigle to path construct (CORPUS_DOC.TEXT => CORPUS/DOC/TEXT)
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# All commands expect the output to be an existing directory,
# unless the output is written to a tar
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
631
Akron63f20d42017-04-10 23:40:29 +0200632
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given), expand all
  # wildcards and sort the resulting files by length
  @input =
    sort { length($a) <=> length($b) }
    map  { bsd_glob($input_base ? catfile($input_base, $_) : $_) }
    @input;

  print 'Input is ' . join(', ', @input) . "\n";
};
652
653
# Process a single file (no command was given)
if (!$cmd) {
  my $input = $input[0];

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time since the last stop and since the start
  sub stop_time {
    my $now        = Benchmark->new;
    my $since_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $since_stop . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Create, parse and convert the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
685
Nils Diewald59094f22014-11-05 18:20:50 +0000686
Akrone10ad322016-02-27 10:54:26 +0100687# Extract XML files
Akron81500102017-04-07 20:45:44 +0200688if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100689
Akrond5643ad2017-07-04 20:27:13 +0200690 # Output is required
691 pod2usage(%ERROR_HASH) unless $output;
692
Akron7d4cdd82016-08-17 21:39:45 +0200693 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200694 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100695
Akron7d4cdd82016-08-17 21:39:45 +0200696 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100697 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200698 $log->error("Unzip is not installed or incompatible.");
699 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100700 };
701
Akronb0c88db2016-06-29 16:33:18 +0200702 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200703 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200704
Akron31a08cb2019-02-20 20:43:26 +0100705 # Will set @sigle
706 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200707
Akrone10ad322016-02-27 10:54:26 +0100708 # Iterate over all given sigles and extract
709 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100710
Akron2812ba22016-10-28 21:55:59 +0200711 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200712
Akron03b24db2016-08-16 20:54:32 +0200713 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200714 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100715
Akron955b75b2019-02-21 14:28:41 +0100716 # TODO:
717 # - prefix???
718 $archive->extract_sigle([$_], $output, $jobs)
719 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200720 );
Akrone10ad322016-02-27 10:54:26 +0100721 print "extracted.\n";
722 };
Akronb0c88db2016-06-29 16:33:18 +0200723 }
Akron7d4cdd82016-08-17 21:39:45 +0200724
725 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200726 else {
727 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200728 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100729 };
730}
731
Akron81500102017-04-07 20:45:44 +0200732
Akron941c1a62016-02-23 17:41:41 +0100733# Process an archive
734elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000735
Akron81500102017-04-07 20:45:44 +0200736 my $archive_output;
737
738 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100739 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200740
741 # Create new archive object
742 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
743
744 # Check zip capabilities
745 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200746 $log->error("Unzip is not installed or incompatible.");
747 exit 1;
Akron81500102017-04-07 20:45:44 +0200748 };
749
750 # Add further annotation archived
751 $archive->attach($_) foreach @input[1..$#input];
752
753 # Create a temporary directory
754 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200755 $extract_dir = tempdir(CLEANUP => 0);
756 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200757 };
758
Akron63f20d42017-04-10 23:40:29 +0200759 # Add some random extra to avoid clashes with multiple archives
760 $extract_dir = catdir($extract_dir, random_string('cccccc'));
761
Akron31a08cb2019-02-20 20:43:26 +0100762 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200763 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200764 @input = ($extract_dir);
765 }
766 else {
767 $log->error('Unable to extract from primary archive ' . $input[0] .
768 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200769 exit 1;
Akron81500102017-04-07 20:45:44 +0200770 };
771 }
772
773 # Can't create archive object
774 else {
775 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200776 exit 1;
Akron81500102017-04-07 20:45:44 +0200777 };
778 };
779
Akron7d4cdd82016-08-17 21:39:45 +0200780 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100781 my $pool = Parallel::ForkManager->new($jobs);
782
Akron7d4cdd82016-08-17 21:39:45 +0200783 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100784 my $iter = 1; # Current text in process
785
Akronda3097e2017-04-23 19:53:57 +0200786 my $tar_archive;
787 my $output_dir = $output;
788 my $tar_fh;
789
790 # Initialize tar archive
791 if ($to_tar) {
792 $tar_archive = Archive::Tar::Builder->new(
793 ignore_errors => 1
794 );
795
796 # Set output name
797 my $tar_file = $output;
798 unless ($tar_file =~ /\.tar$/) {
799 $tar_file .= '.tar';
800 };
801
802 # Initiate the tar file
803 print "Writing to file $tar_file\n";
804 $tar_fh = IO::File->new($tar_file, 'w');
805 $tar_fh->binmode(1);
806
807 # Set handle
808 $tar_archive->set_handle($tar_fh);
809
810 # Output to temporary directory
811 $output_dir = File::Temp->newdir;
812 };
813
Akron941c1a62016-02-23 17:41:41 +0100814 # Report on fork message
815 $pool->run_on_finish (
816 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200817 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100818 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200819
Akron08385f62016-03-22 20:37:04 +0100820 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200821 ($iter++) . "/$count]" .
822 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200823 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200824
825 if (!$code && $to_tar && $data->[2]) {
826 my $filename = $data->[2];
827
828 # Lock filehandle
829 if (flock($tar_fh, LOCK_EX)) {
830
Akron9a062ce2017-07-04 19:12:05 +0200831 my $clean_file = fileparse($filename);
832
Akronda3097e2017-04-23 19:53:57 +0200833 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200834 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200835 unlink $filename;
836
837 # Unlock filehandle
838 flock($tar_fh, LOCK_UN);
839 }
840 else {
841 $log->warn("Unable to add $filename to archive");
842 };
843 };
844
Akron4c0cf312016-10-15 16:42:09 +0200845 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100846 }
847 );
848
849 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200850 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100851 print "Reading data ...\n";
852
Akron7d4cdd82016-08-17 21:39:45 +0200853 # unless (Cache::FastMmap->new(
854 # share_file => $cache_file,
855 # cache_size => $cache_size,
856 # init_file => $cache_init
857 # )) {
858 # print "Unable to intialize cache '$cache_file'\n\n";
859 # exit(1);
860 # };
Akron11c80302016-03-18 19:44:43 +0100861
Akron486f9ab2017-04-22 23:25:19 +0200862
Akron941c1a62016-02-23 17:41:41 +0100863 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100864 if (-d $input[0]) {
865 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100866 my @dirs;
867 my $dir;
868
Akron7d4cdd82016-08-17 21:39:45 +0200869 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100870 while (1) {
871 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200872 push @dirs, $dir;
873 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100874 };
875 last unless $it->next;
876 };
877
878 print "Start processing ...\n";
879 $t = Benchmark->new;
880 $count = scalar @dirs;
881
882 DIRECTORY_LOOP:
883 for (my $i = 0; $i < $count; $i++) {
884
Akrone1dbc382016-07-08 22:24:52 +0200885 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200886 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200887 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200888 );
Akron941c1a62016-02-23 17:41:41 +0100889
890 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200891 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200892
Akron13d56622016-10-31 14:54:49 +0100893 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200894 $pool->finish(
895 0,
Akronda3097e2017-04-23 19:53:57 +0200896 [
897 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
898 undef,
899 $filename
900 ]
Akron486f9ab2017-04-22 23:25:19 +0200901 );
Akron3ec48972016-08-17 23:24:52 +0200902 }
903 else {
Akron4c0cf312016-10-15 16:42:09 +0200904 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200905 };
Akron941c1a62016-02-23 17:41:41 +0100906 };
907 }
908
909 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200910 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200911
Akron941c1a62016-02-23 17:41:41 +0100912 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200913 $log->error("Unzip is not installed or incompatible.");
914 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100915 };
916
Akron08385f62016-03-22 20:37:04 +0100917 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200918 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100919
Akron31a08cb2019-02-20 20:43:26 +0100920 # Get sigles to extract
921 my $prefix = set_sigle($archive);
922
Akron941c1a62016-02-23 17:41:41 +0100923 print "Start processing ...\n";
924 $t = Benchmark->new;
925 my @dirs = $archive->list_texts;
926 $count = scalar @dirs;
927
928 ARCHIVE_LOOP:
929 for (my $i = 0; $i < $count; $i++) {
930
931 # Split path information
932 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
933
Akrone1dbc382016-07-08 22:24:52 +0200934 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200935 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200936 get_file_name(
937 catfile($corpus, $doc, $text)
938 . '.json' . ($gzip ? '.gz' : '')
939 )
Akrone1dbc382016-07-08 22:24:52 +0200940 );
Akron941c1a62016-02-23 17:41:41 +0100941
942 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200943 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100944
Akron4c0cf312016-10-15 16:42:09 +0200945 # Create temporary file
946 $temp = File::Temp->newdir;
947
Akronbdf434a2016-10-24 17:42:07 +0200948 # TODO: Check if $filename exist at the beginning,
949 # because extraction can be horrible slow!
950
Akron941c1a62016-02-23 17:41:41 +0100951 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100952 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100953
Akron7d4cdd82016-08-17 21:39:45 +0200954 # Create corpus directory
955 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100956
Akron7d4cdd82016-08-17 21:39:45 +0200957 # Temporary directory
958 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100959
Akron7d4cdd82016-08-17 21:39:45 +0200960 # Write file
Akron13d56622016-10-31 14:54:49 +0100961 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200962
Akron4c0cf312016-10-15 16:42:09 +0200963 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100964 $pool->finish(
965 0,
Akronda3097e2017-04-23 19:53:57 +0200966 [
967 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
968 $temp,
969 $filename
970 ]
Akron13d56622016-10-31 14:54:49 +0100971 );
972 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200973 }
974 else {
Akron4c0cf312016-10-15 16:42:09 +0200975 # Delete temporary file
976 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200977 };
Akron941c1a62016-02-23 17:41:41 +0100978 }
Akron7d4cdd82016-08-17 21:39:45 +0200979
980 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100981 else {
Akron4c0cf312016-10-15 16:42:09 +0200982 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100983 };
984 };
985 }
986
987 else {
988 print "Input is neither a directory nor an archive.\n\n";
989 };
990
991 $pool->wait_all_children;
992
Akron11c80302016-03-18 19:44:43 +0100993 # Delete cache file
994 unlink($cache_file) if $cache_delete;
995
Akronda3097e2017-04-23 19:53:57 +0200996 # Close tar filehandle
997 if ($to_tar && $tar_fh) {
998 $tar_archive->finish;
999 $tar_fh->close;
1000 print "Wrote to tar archive.\n";
1001 };
1002
Akron63f20d42017-04-10 23:40:29 +02001003 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001004 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001005};
Akron941c1a62016-02-23 17:41:41 +01001006
Nils Diewald2db9ad02013-10-29 19:26:43 +00001007
# For an archive, this will create the list
# of all sigles to process.
#
# Expects the archive object as its only argument and works on the
# file-level @sigle list:
#
# - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#   for every text listed by the archive.
# - Otherwise every doc sigle (two path segments, optionally preceded
#   by "./") is extracted to $output right away and dropped from
#   @sigle, so that only text sigles remain in the list.
#
# Returns the prefix value: the one reported by split_path for the
# last listed text, the result of check_prefix, or the initial 1 if
# neither was consulted.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - register all texts of the archive
  if (!@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles were given - separate doc sigles from text sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Ask the archive for its prefix only once, on the first sigle seen.
  # The progress note is printed right where the check happens, so the
  # call position inside the branches below determines the output order.
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Iterate over all sigle
  foreach my $entry (@sigle) {

    # Sigle is a doc sigle
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$entry ...";
      $check_prefix_once->();
      print "\n";

      # Doc sigles are extracted immediately and not kept in the list.
      # Extraction runs with a single job if sequential extraction
      # is requested, otherwise with the configured number of jobs.
      print '... ' . (
        $archive->extract_sigle([$entry], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $entry;
      $check_prefix_once->();
    };
  };

  # Only text sigles are left for further processing
  @sigle = @text_sigle;

  return $prefix;
};
1081
1082
1083
# Remove the extraction directory, if one is set, and
# report the count returned by remove_tree
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};

# Finish the output with an empty line
print "\n";
1092
Nils Diewald2db9ad02013-10-29 19:26:43 +00001093__END__
Akron941c1a62016-02-23 17:41:41 +01001094
1095=pod
1096
1097=encoding utf8
1098
1099=head1 NAME
1100
Akronf7ad89e2016-03-16 18:22:47 +01001101korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001102
1103
1104=head1 SYNOPSIS
1105
Akrona76d8352016-10-27 16:27:32 +02001106 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001107
Akron2fd402b2016-10-27 21:26:48 +02001108
Akron941c1a62016-02-23 17:41:41 +01001109=head1 DESCRIPTION
1110
1111L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1112compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001113The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001114
1115
1116=head1 INSTALLATION
1117
1118The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1119
Akronaf386982016-10-12 00:33:25 +02001120 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001121
Akronc13a1702016-03-15 19:33:14 +01001122In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001123be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001124Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001126
1127=head1 ARGUMENTS
1128
Akrona76d8352016-10-27 16:27:32 +02001129 $ korapxml2krill -z --input <directory> --output <filename>
1130
1131Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001132It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001133
Akron941c1a62016-02-23 17:41:41 +01001134=over 2
1135
1136=item B<archive>
1137
Akron081639e2017-04-21 19:01:39 +02001138 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001139
Akron2fd402b2016-10-27 21:26:48 +02001140Converts an archive of KorAP-XML documents. It expects a directory
1141(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001142
1143=item B<extract>
1144
Akrona76d8352016-10-27 16:27:32 +02001145 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1146
1147Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001148
Akron63f20d42017-04-10 23:40:29 +02001149=item B<serial>
1150
1151 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1152
1153Convert archives sequentially. The inputs are not merged but treated
1154as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001156are created based on the archive name. In case the C<--to-tar> flag is given,
1157the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001158
1159
Akron941c1a62016-02-23 17:41:41 +01001160=back
1161
1162
1163=head1 OPTIONS
1164
1165=over 2
1166
Akrona76d8352016-10-27 16:27:32 +02001167=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001168
Akrona76d8352016-10-27 16:27:32 +02001169Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001170
Akron7606afa2016-10-25 16:23:49 +02001171Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001172document, while C<archive> expects a KorAP-XML corpus folder or a zip
1173file to batch process multiple files.
1174C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001175
Akrona76d8352016-10-27 16:27:32 +02001176C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001177that the first archive listed contains all primary data files
1178and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001179
Akron7606afa2016-10-25 16:23:49 +02001180 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001181
Akron821db3d2017-04-06 21:19:31 +02001182Input may also be defined using BSD glob wildcards.
1183
1184 -i 'file/news*.zip'
1185
1186The extended input array will be sorted in length order, so the shortest
1187path needs to contain all primary data files and all meta data files.
1188
Akron0c3e3752016-06-28 15:55:53 +02001189(The directory structure follows the base directory format,
1190that may include a C<.> root folder.
1191In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001192need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001194
Akron7606afa2016-10-25 16:23:49 +02001195To support zip files, a version of C<unzip> needs to be installed that is
1196compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001197
Akron7606afa2016-10-25 16:23:49 +02001198B<The root folder switch using the hash sign is experimental and
1199may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001200
Akronf73ffb62018-06-27 12:13:59 +02001201
Akron63f20d42017-04-10 23:40:29 +02001202=item B<--input-base|-ib> <directory>
1203
1204The base directory for inputs.
1205
1206
Akron941c1a62016-02-23 17:41:41 +01001207=item B<--output|-o> <directory|file>
1208
1209Output folder for archive processing or
1210document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001211writes to C<STDOUT> by default
1212(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001213
1214=item B<--overwrite|-w>
1215
1216Overwrite files that already exist.
1217
Akronf73ffb62018-06-27 12:13:59 +02001218
Akron3741f8b2016-12-21 19:55:21 +01001219=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001220
1221Define the default tokenization by specifying
1222the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001223of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001224
Akron3741f8b2016-12-21 19:55:21 +01001225
1226=item B<--base-sentences|-bs> <foundry>#<layer>
1227
1228Define the layer for base sentences.
1229If given, this will be used instead of using C<Base#Sentences>.
1230Currently C<DeReKo#Structure> is the only additional layer supported.
1231
1232 Defaults to unset.
1233
1234
1235=item B<--base-paragraphs|-bp> <foundry>#<layer>
1236
1237Define the layer for base paragraphs.
1238If given, this will be used instead of using C<Base#Paragraphs>.
1239Currently C<DeReKo#Structure> is the only additional layer supported.
1240
1241 Defaults to unset.
1242
1243
Akron41ac10b2017-02-08 22:47:25 +01001244=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1245
1246Define the layer for base pagebreaks.
1247Currently C<DeReKo#Structure> is the only layer supported.
1248
1249 Defaults to unset.
1250
1251
Akron941c1a62016-02-23 17:41:41 +01001252=item B<--skip|-s> <foundry>[#<layer>]
1253
Akronf7ad89e2016-03-16 18:22:47 +01001254Skip specific annotations by specifying the foundry
1255(and optionally the layer with a C<#>-prefix),
1256e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001257Can be set multiple times.
1258
Akronf73ffb62018-06-27 12:13:59 +02001259
Akronc13a1702016-03-15 19:33:14 +01001260=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001261
Akronf7ad89e2016-03-16 18:22:47 +01001262Convert specific annotations by specifying the foundry
1263(and optionally the layer with a C<#>-prefix),
1264e.g. C<Mate> or C<Mate#Morpho>.
1265Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001266
Akronf73ffb62018-06-27 12:13:59 +02001267
Akron941c1a62016-02-23 17:41:41 +01001268=item B<--primary|-p>
1269
Akronc13a1702016-03-15 19:33:14 +01001270Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001271Can be flagged using C<--no-primary> as well.
1272This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001273
Akronf73ffb62018-06-27 12:13:59 +02001274
Akroned9baf02019-01-22 17:03:25 +01001275=item B<--non-word-tokens|-nwt>
1276
1277Tokenize non-word tokens like word tokens (defined as matching
1278C</[\d\w]/>). Useful to treat punctuations as tokens.
1279
1280 Defaults to unset.
1281
Akron941c1a62016-02-23 17:41:41 +01001282=item B<--jobs|-j>
1283
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001285for archive processing.
Akron11c80302016-03-18 19:44:43 +01001286Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001287
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1290
Akronc11f7982017-02-21 21:20:14 +01001291Pass -1, and the value will be set automatically to 5
1292times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001293This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001294
Akronf73ffb62018-06-27 12:13:59 +02001295
Akron263274c2019-02-07 09:48:30 +01001296=item B<--koral|-k>
1297
1298Version of the output format. Supported versions are:
1299C<0> for legacy serialization, C<0.03> for serialization
1300with metadata fields as key-values on the root object,
1301C<0.4> for serialization with metadata fields as a list
1302of C<"@type":"koral:field"> objects.
1303
1304Currently defaults to C<0.03>.
1305
1306
Akron9ec88872017-04-12 16:29:06 +02001307=item B<--sequential-extraction|-se>
1308
Flag to indicate that extraction should run sequentially,
i.e. that the C<jobs> value is not applied to extraction.
1310Some systems may have problems with extracting multiple archives
1311to the same folder at the same time.
1312Can be flagged using C<--no-sequential-extraction> as well.
1313Defaults to C<false>.
1314
Akronf73ffb62018-06-27 12:13:59 +02001315
Akron35db6e32016-03-17 22:42:22 +01001316=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001317
Akron35db6e32016-03-17 22:42:22 +01001318Define the metadata parser to use. Defaults to C<I5>.
1319Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1320This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001321
Akronf73ffb62018-06-27 12:13:59 +02001322
Akron941c1a62016-02-23 17:41:41 +01001323=item B<--pretty|-y>
1324
Akronc13a1702016-03-15 19:33:14 +01001325Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001326This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001327
Akronf73ffb62018-06-27 12:13:59 +02001328
Akron941c1a62016-02-23 17:41:41 +01001329=item B<--gzip|-z>
1330
Akronf7ad89e2016-03-16 18:22:47 +01001331Compress the output.
1332Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001333
Akronf73ffb62018-06-27 12:13:59 +02001334
Akron11c80302016-03-18 19:44:43 +01001335=item B<--cache|-c>
1336
1337File to mmap a cache (using L<Cache::FastMmap>).
1338Defaults to C<korapxml2krill.cache> in the calling directory.
1339
Akronf73ffb62018-06-27 12:13:59 +02001340
Akron11c80302016-03-18 19:44:43 +01001341=item B<--cache-size|-cs>
1342
1343Size of the cache. Defaults to C<50m>.
1344
Akronf73ffb62018-06-27 12:13:59 +02001345
Akron11c80302016-03-18 19:44:43 +01001346=item B<--cache-init|-ci>
1347
1348Initialize cache file.
1349Can be flagged using C<--no-cache-init> as well.
1350Defaults to C<true>.
1351
Akronf73ffb62018-06-27 12:13:59 +02001352
Akron11c80302016-03-18 19:44:43 +01001353=item B<--cache-delete|-cd>
1354
1355Delete cache file after processing.
1356Can be flagged using C<--no-cache-delete> as well.
1357Defaults to C<true>.
1358
Akronf73ffb62018-06-27 12:13:59 +02001359
Akron636aa112017-04-07 18:48:56 +02001360=item B<--config|-cfg>
1361
1362Configure the parameters of your call in a file
1363of key-value pairs with whitespace separator
1364
1365 overwrite 1
1366 token DeReKo#Structure
1367 ...
1368
1369Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001370C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001371C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001372C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001374C<base-sentences>, C<base-paragraphs>,
1375C<base-pagebreaks>,
1376C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001377(semicolon separated), C<anno> (semicolon separated).
1378
Akronf73ffb62018-06-27 12:13:59 +02001379Configuration parameters will always be overwritten by
1380passed parameters.
1381
1382
Akron81500102017-04-07 20:45:44 +02001383=item B<--temporary-extract|-te>
1384
1385Only valid for the C<archive> command.
1386
1387This will first extract all files into a
1388directory and then will archive.
1389If the directory is given as C<:temp:>,
1390a temporary directory is used.
1391This is especially useful to avoid
1392massive unzipping and potential
1393network latency.
Akron636aa112017-04-07 18:48:56 +02001394
Akronf73ffb62018-06-27 12:13:59 +02001395
Akronc93a0802019-07-11 15:48:34 +02001396=item B<--to-tar>
1397
1398Only valid for the C<archive> command.
1399
1400Writes the output into a tar archive.
1401
1402
Akrone10ad322016-02-27 10:54:26 +01001403=item B<--sigle|-sg>
1404
Akron20807582016-10-26 17:11:34 +02001405Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001406Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001407I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001408Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001409In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001410On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001411
Akronf73ffb62018-06-27 12:13:59 +02001412
Akron941c1a62016-02-23 17:41:41 +01001413=item B<--log|-l>
1414
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1416
Akronf73ffb62018-06-27 12:13:59 +02001417
Akron941c1a62016-02-23 17:41:41 +01001418=item B<--help|-h>
1419
1420Print this document.
1421
Akronf73ffb62018-06-27 12:13:59 +02001422
Akron941c1a62016-02-23 17:41:41 +01001423=item B<--version|-v>
1424
1425Print version information.
1426
1427=back
1428
Akronf73ffb62018-06-27 12:13:59 +02001429
Akronc13a1702016-03-15 19:33:14 +01001430=head1 ANNOTATION SUPPORT
1431
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1433developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1434The base foundry with paragraphs, sentences, and the text element are mandatory for
1435L<Krill|https://github.com/KorAP/Krill>.
1436
Akron821db3d2017-04-06 21:19:31 +02001437 Base
1438 #Paragraphs
1439 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001440
Akron821db3d2017-04-06 21:19:31 +02001441 Connexor
1442 #Morpho
1443 #Phrase
1444 #Sentences
1445 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001446
Akron821db3d2017-04-06 21:19:31 +02001447 CoreNLP
1448 #Constituency
1449 #Morpho
1450 #NamedEntities
1451 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001452
Akronce125b62017-06-19 11:54:36 +02001453 CMC
1454 #Morpho
1455
Akron821db3d2017-04-06 21:19:31 +02001456 DeReKo
1457 #Structure
Akronc13a1702016-03-15 19:33:14 +01001458
Akron57510c12019-01-04 14:58:53 +01001459 DGD
1460 #Morpho
1461
Akron821db3d2017-04-06 21:19:31 +02001462 DRuKoLa
1463 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001464
Akron821db3d2017-04-06 21:19:31 +02001465 Glemm
1466 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001467
Akronea1aed52018-07-19 14:43:34 +02001468 HNC
1469 #Morpho
1470
Akron4c679192018-01-16 17:41:49 +01001471 LWC
1472 #Dependency
1473
Akron821db3d2017-04-06 21:19:31 +02001474 Malt
1475 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001476
Akron821db3d2017-04-06 21:19:31 +02001477 MarMoT
1478 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001479
Akron821db3d2017-04-06 21:19:31 +02001480 Mate
1481 #Dependency
1482 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001483
Akron821db3d2017-04-06 21:19:31 +02001484 MDParser
1485 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001486
Akron821db3d2017-04-06 21:19:31 +02001487 OpenNLP
1488 #Morpho
1489 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001490
Akron821db3d2017-04-06 21:19:31 +02001491 Sgbr
1492 #Lemma
1493 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001494
Akron7d5e6382019-08-08 16:36:27 +02001495 Talismane
1496 #Dependency
1497 #Morpho
1498
Akron821db3d2017-04-06 21:19:31 +02001499 TreeTagger
1500 #Morpho
1501 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001502
Akron821db3d2017-04-06 21:19:31 +02001503 XIP
1504 #Constituency
1505 #Morpho
1506 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001507
Akronc13a1702016-03-15 19:33:14 +01001508
1509More importers are in preparation.
1510New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1511See the built-in annotation importers as examples.
1512
Akronf73ffb62018-06-27 12:13:59 +02001513
Akron941c1a62016-02-23 17:41:41 +01001514=head1 AVAILABILITY
1515
1516 https://github.com/KorAP/KorAP-XML-Krill
1517
1518
1519=head1 COPYRIGHT AND LICENSE
1520
Akroned9baf02019-01-22 17:03:25 +01001521Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001522
Akron941c1a62016-02-23 17:41:41 +01001523Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001524
Akrona76d8352016-10-27 16:27:32 +02001525Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001526
1527L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1528Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001529L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001530member of the
1531L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1532
1533This program is free software published under the
1534L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1535
1536=cut