blob: 831aa2bafc7c937b58ce22efb5459abdce2e2653 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron941c1a62016-02-23 17:41:41 +0100141# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100142
# Date of the most recent entry in the CHANGES list of this file;
# it is shown as part of the version banner.
our $LAST_CHANGE = '2019/02/13';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Koral version used unless another one is requested
# by option or configuration
our $KORAL_VERSION = 0.03;

# Banner passed as message to pod2usage (help, version and usage errors)
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
149
Akron63f20d42017-04-10 23:40:29 +0200150# Prototypes
151sub get_file_name_from_glob($);
152sub get_file_name($);
153
# Parse command:
# A leading argument that does not start with a dash is taken
# as the sub command and removed from the argument list.
our @ARGV;
my $cmd = ($ARGV[0] && $ARGV[0] !~ /^-/) ? shift(@ARGV) : undef;

# Keep a copy of the remaining arguments, as option parsing
# will consume @ARGV (the copy is forwarded by serial processing)
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100161
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Single value options - they stay undefined unless given on the
# command line, so configuration values and defaults can be
# applied afterwards
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,

  # Print the full usage information
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
210
Akron63f20d42017-04-10 23:40:29 +0200211
# Load from configuration.
# Values from the configuration file are only used for options that
# were not passed on the command line.
if ($cfg_file) {
  my %config;

  # Report an unusable configuration file instead of silently
  # continuing without it. The logger is not initialized yet
  # (its level may be part of the configuration), therefore warn.
  if (!-e $cfg_file) {
    warn "Configuration file '$cfg_file' does not exist\n";
  }
  elsif (!Config::Simple->import_from($cfg_file, \%config)) {
    warn "Unable to read configuration file '$cfg_file': " .
      (Config::Simple->error // 'unknown error') . "\n";
  };

  # Single value options: configuration key => target variable
  foreach my $option (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],                 # Koral version
    [ 'input-base'            => \$input_base ],            # Input root base directory
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],            # Token base
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'cache'                 => \$cache_file ],            # Cache file
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ], # Jobs for extraction
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],                # Write to tar
    [ 'log'                   => \$log_level ]
  ) {
    my ($key, $target) = @$option;
    if (!defined($$target) && defined $config{$key}) {
      $$target = $config{$key};
    };
  };

  # Multi value options: the configuration value is a
  # semicolon separated list
  foreach my $option (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  ) {
    my ($key, $target) = @$option;
    if (!scalar(@$target) && defined $config{$key}) {
      @$target = split /\s*;\s*/, $config{$key};
    };
  };
};
333
Akron63f20d42017-04-10 23:40:29 +0200334
# Set defaults for everything that was neither passed on the
# command line nor set by the configuration file
foreach my $default (
  [ \$token_base            => 'OpenNLP#tokens' ],
  [ \$cache_file            => 'korapxml2krill.cache' ],
  [ \$cache_size            => '50m' ],
  [ \$jobs                  => 0 ],
  [ \$koral                 => $KORAL_VERSION ],
  [ \$cache_delete          => 1 ],
  [ \$cache_init            => 1 ],
  [ \$sequential_extraction => 0 ],
  [ \$log_level             => 'ERROR' ],
  [ \$non_word_tokens       => 0 ]
) {
  my ($target, $value) = @$default;
  $$target = $value unless defined $$target;
};

# The base annotation names default to the empty string and are
# lower-cased, as they are compared with lower case literals later on
$_ = lc($_ // '') foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100353
Akron63f20d42017-04-10 23:40:29 +0200354
# Initialize log4perl object:
# Everything from the requested level upwards is written by the
# appender named STDERR, using the pattern layout given below
{
  my $root_logger = uc($log_level) . ', STDERR';
  Log::Log4perl->init({
    'log4perl.rootLogger' => $root_logger,
    'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout' => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
  });
};

my $log = Log::Log4perl->get_logger('main');

# Tell the user which configuration file was requested
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
367
368
# Parameters for pod2usage in case of wrong usage:
# print the usage sections and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# A job count of -1 means: derive the number of jobs
# from the number of available CPU cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
389
Akron821db3d2017-04-06 21:19:31 +0200390
# Start serial processing:
# Every input is converted by a separate 'archive' run of this script,
# each writing below its own sub directory of the output directory.
# The script exits with status 1 if at least one of the runs failed.
if ($cmd && $cmd eq 'serial') {

  # Output is required - without it, the per-input paths
  # would be built from an undefined base directory
  pod2usage(%ERROR_HASH) unless $output;

  if (!defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments to forward,
  # as they are set individually for each run
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag - the value follows as the next argument
    if ($_ eq '-i' || $_ eq '--input' || $_ eq '--output' || $_ eq '-o') {
      $remove_next = 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    }

    # Flag and value in a single argument, e.g. --input=archive.zip
    elsif (/^--?(?:input|i|output|o)=/) {
      $keep = 0;
    };

    # Pass all other parameters
    $keep;
  } @keep_argv;

  # Number of failed archive runs
  my $failed = 0;

  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $archive_input, '-o', $new_out
    );
    print "Start serial processing of $archive_input to $new_out\n";

    # Start archiving - continue with the next input on failure,
    # but remember it for the exit status
    if (system(@archive_cmd) != 0) {
      $log->error("Processing of $archive_input failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
445
# Lookup table of everything to skip (lower case)
my %skip = map { ( lc($_) => 1 ) } @skip;

# Supported annotations as [Foundry, Layer] pairs,
# optionally followed by a further parameter
my @layers;

# Base annotations are only used, if no other base is defined
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

# Foundries listed in front of DeReKo
foreach my $group (
  [ Connexor => qw/Morpho Syntax Phrase Sentences/ ],
  [ CoreNLP  => qw/NamedEntities Sentences Morpho Constituency/ ],
  [ CMC      => qw/Morpho/ ]
) {
  my ($foundry, @layer_names) = @$group;
  push(@layers, map { [$foundry, $_] } @layer_names);
};

# DeReKo - collect the structures that are based on DeReKo#Structure
my @dereko_attr = ();
push(@dereko_attr, 'sentences')  if $base_sentences  eq 'dereko#structure';
push(@dereko_attr, 'paragraphs') if $base_paragraphs eq 'dereko#structure';
push(@dereko_attr, 'pagebreaks') if $base_pagebreaks eq 'dereko#structure';

push(
  @layers,
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  )
);

# Foundries listed behind DeReKo
foreach my $group (
  [ Glemm      => qw/Morpho/ ],
  [ HNC        => qw/Morpho/ ],
  [ LWC        => qw/Dependency/ ],
  [ Malt       => qw/Dependency/ ],
  [ MDParser   => qw/Dependency/ ],
  [ Mate       => qw/Morpho Dependency/ ],
  [ OpenNLP    => qw/Morpho Sentences/ ],
  [ Sgbr       => qw/Lemma Morpho/ ],       # Schreibgebrauch
  [ TreeTagger => qw/Morpho Sentences/ ],
  [ XIP        => qw/Morpho Constituency Sentences Dependency/ ],
  [ DRuKoLa    => qw/Morpho/ ],
  [ MarMoT     => qw/Morpho/ ]
) {
  my ($foundry, @layer_names) = @$group;
  push(@layers, map { [$foundry, $_] } @layer_names);
};

# Check filters
my @filtered_anno;

# Everything is skipped - use the explicitly requested annotations only
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# either as Foundry or as Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
550
# Get tokenization basis, given as Foundry#Layer.
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (there is no layer, if the base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200556
# Shared cache, backed by the cache file
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object:
# It receives the tokenization basis, the output settings
# and the list of annotations to respect
my $batch_file = KorAP::XML::Batch::File->new(
  cache           => $cache,
  meta_type       => $meta,
  overwrite       => $overwrite,
  foundry         => $token_base_foundry,
  layer           => $token_base_layer,
  gzip            => $gzip,
  log             => $log,
  koral           => $koral,
  primary         => $primary,
  pretty          => $pretty,
  anno            => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
579
# Get file name based on path information.
#
# Expects the path of a file and returns a flat name, in which the
# path separators are replaced by dashes. Temporary directory
# fragments and the path of the first input are removed beforehand.
sub get_file_name ($) {
  my $file = shift;

  # Base path is the first input - in case this is a directory,
  # the last path segment is removed
  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s{[^/]+$}{};
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the base path - it is matched literally, so characters
  # like '.', '+' or '(' in a path are not treated as regular
  # expression syntax
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path and remove leading dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop a leading segment, as long as at least three
  # dash separated parts remain
  # NOTE(review): presumably meant to keep the corpus-doc-text triple - confirm
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
596
Akron63f20d42017-04-10 23:40:29 +0200597
# Create a file name from an input path or glob pattern.
#
# Expects a path (that may contain wildcards) and returns a name
# without path separators, wildcards, brackets and the '.zip'
# extension - e.g. 'corpus/{a,b}.zip' results in 'corpus-a,b'.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;       # Transform paths
  $glob =~ s/[\*\?]//g;        # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;        # Remove sequences of binding characters
  $glob =~ s/^-//;             # Clean beginning
  $glob =~ s/-$//;             # Clean end
  $glob =~ s/\.zip$//;         # Remove file extension

  # Clean end again - a binding character directly in front of the
  # extension (e.g. from a closing bracket) is only at the end now
  $glob =~ s/-$//;
  return $glob;
};
609
610
# Convert sigle to path construct:
# The first underscore and the first following dot become slashes,
# surrounding whitespace is removed
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands write to an output directory that has to exist,
# unless the output is written to a tar file
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
620
Akron63f20d42017-04-10 23:40:29 +0200621
# Glob and prefix files
if (@input) {

  my @patterns  = ();
  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@patterns, $pattern);
    push(@new_input, bsd_glob($pattern));
  };

  # Nothing matched - stop here, as all further processing
  # relies on at least one input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @patterns));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
641
642
# Process a single file (no command was given)
if (!$cmd) {
  my $input = $input[0];

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start,
  # then restart the clock for the next stop
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $since_last . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
674
Nils Diewald59094f22014-11-05 18:20:50 +0000675
Akrone10ad322016-02-27 10:54:26 +0100676# Extract XML files
Akron81500102017-04-07 20:45:44 +0200677if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100678
Akrond5643ad2017-07-04 20:27:13 +0200679 # Output is required
680 pod2usage(%ERROR_HASH) unless $output;
681
Akron7d4cdd82016-08-17 21:39:45 +0200682 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200683 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100684
Akron7d4cdd82016-08-17 21:39:45 +0200685 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100686 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200687 $log->error("Unzip is not installed or incompatible.");
688 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100689 };
690
Akronb0c88db2016-06-29 16:33:18 +0200691 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200692 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200693
Akron31a08cb2019-02-20 20:43:26 +0100694 # Will set @sigle
695 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200696
Akron31a08cb2019-02-20 20:43:26 +0100697# my $prefix = 1;
698#
699# # No sigles given
700# unless (@sigle) {
701#
702# # Get files
703# foreach ($archive->list_texts) {
704#
705# # Split path information
706# ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
707#
708# # TODO: Make this OS independent
709# push @sigle, join '/', $corpus, $doc, $text;
710# };
711# }
712#
713# # Check sigle for doc sigles
714# else {
715# my @new_sigle;
716#
717# my $prefix_check = 0;
718#
719# # Iterate over all sigle
720# foreach (@sigle) {
721#
722# # Sigle is a doc sigle
723# if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
724#
725# print "$_ ...";
726# # Check if a prefix is needed
727# unless ($prefix_check) {
728#
729# if ($prefix = $archive->check_prefix) {
730# print " with prefix ...";
731# };
732# $prefix_check = 1;
733# };
734#
735# print "\n";
736#
737# # TODO: Make this OS independent
738# my $path = ($prefix ? './' : '') . $_;
739#
740# print '... ' . (
741# $archive->extract_doc(
742# $path, $output, $sequential_extraction ? 1 : $jobs
743# ) ? '' : 'not '
744# );
745# print "extracted.\n";
746# }
747#
748# # Sigle is a text sigle
749# else {
750# push @new_sigle, $_;
751#
752# unless ($prefix_check) {
753#
754# if ($prefix = $archive->check_prefix) {
755# print " with prefix ...";
756# };
757# $prefix_check = 1;
758# };
759# };
760# };
761# @sigle = @new_sigle;
762# };
Akron03b24db2016-08-16 20:54:32 +0200763
Akrone10ad322016-02-27 10:54:26 +0100764 # Iterate over all given sigles and extract
765 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100766
Akron2812ba22016-10-28 21:55:59 +0200767 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200768
Akron03b24db2016-08-16 20:54:32 +0200769 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200770 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100771
Akron955b75b2019-02-21 14:28:41 +0100772 # TODO:
773 # - prefix???
774 $archive->extract_sigle([$_], $output, $jobs)
775 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200776 );
Akrone10ad322016-02-27 10:54:26 +0100777 print "extracted.\n";
778 };
Akronb0c88db2016-06-29 16:33:18 +0200779 }
Akron7d4cdd82016-08-17 21:39:45 +0200780
781 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200782 else {
783 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200784 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100785 };
786}
787
Akron81500102017-04-07 20:45:44 +0200788
Akron941c1a62016-02-23 17:41:41 +0100789# Process an archive
790elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000791
Akron81500102017-04-07 20:45:44 +0200792 my $archive_output;
793
794 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100795 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200796
797 # Create new archive object
798 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
799
800 # Check zip capabilities
801 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200802 $log->error("Unzip is not installed or incompatible.");
803 exit 1;
Akron81500102017-04-07 20:45:44 +0200804 };
805
806 # Add further annotation archived
807 $archive->attach($_) foreach @input[1..$#input];
808
809 # Create a temporary directory
810 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200811 $extract_dir = tempdir(CLEANUP => 0);
812 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200813 };
814
Akron63f20d42017-04-10 23:40:29 +0200815 # Add some random extra to avoid clashes with multiple archives
816 $extract_dir = catdir($extract_dir, random_string('cccccc'));
817
Akron31a08cb2019-02-20 20:43:26 +0100818 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200819 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200820 @input = ($extract_dir);
821 }
822 else {
823 $log->error('Unable to extract from primary archive ' . $input[0] .
824 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200825 exit 1;
Akron81500102017-04-07 20:45:44 +0200826 };
827 }
828
829 # Can't create archive object
830 else {
831 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200832 exit 1;
Akron81500102017-04-07 20:45:44 +0200833 };
834 };
835
Akron7d4cdd82016-08-17 21:39:45 +0200836 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100837 my $pool = Parallel::ForkManager->new($jobs);
838
Akron7d4cdd82016-08-17 21:39:45 +0200839 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100840 my $iter = 1; # Current text in process
841
Akronda3097e2017-04-23 19:53:57 +0200842 my $tar_archive;
843 my $output_dir = $output;
844 my $tar_fh;
845
846 # Initialize tar archive
847 if ($to_tar) {
848 $tar_archive = Archive::Tar::Builder->new(
849 ignore_errors => 1
850 );
851
852 # Set output name
853 my $tar_file = $output;
854 unless ($tar_file =~ /\.tar$/) {
855 $tar_file .= '.tar';
856 };
857
858 # Initiate the tar file
859 print "Writing to file $tar_file\n";
860 $tar_fh = IO::File->new($tar_file, 'w');
861 $tar_fh->binmode(1);
862
863 # Set handle
864 $tar_archive->set_handle($tar_fh);
865
866 # Output to temporary directory
867 $output_dir = File::Temp->newdir;
868 };
869
Akron941c1a62016-02-23 17:41:41 +0100870 # Report on fork message
871 $pool->run_on_finish (
872 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200873 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100874 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200875
Akron08385f62016-03-22 20:37:04 +0100876 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200877 ($iter++) . "/$count]" .
878 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200879 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200880
881 if (!$code && $to_tar && $data->[2]) {
882 my $filename = $data->[2];
883
884 # Lock filehandle
885 if (flock($tar_fh, LOCK_EX)) {
886
Akron9a062ce2017-07-04 19:12:05 +0200887 my $clean_file = fileparse($filename);
888
Akronda3097e2017-04-23 19:53:57 +0200889 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200890 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200891 unlink $filename;
892
893 # Unlock filehandle
894 flock($tar_fh, LOCK_UN);
895 }
896 else {
897 $log->warn("Unable to add $filename to archive");
898 };
899 };
900
Akron4c0cf312016-10-15 16:42:09 +0200901 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100902 }
903 );
904
905 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200906 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100907 print "Reading data ...\n";
908
Akron7d4cdd82016-08-17 21:39:45 +0200909 # unless (Cache::FastMmap->new(
910 # share_file => $cache_file,
911 # cache_size => $cache_size,
912 # init_file => $cache_init
913 # )) {
914 # print "Unable to intialize cache '$cache_file'\n\n";
915 # exit(1);
916 # };
Akron11c80302016-03-18 19:44:43 +0100917
Akron486f9ab2017-04-22 23:25:19 +0200918
Akron941c1a62016-02-23 17:41:41 +0100919 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100920 if (-d $input[0]) {
921 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100922 my @dirs;
923 my $dir;
924
Akron7d4cdd82016-08-17 21:39:45 +0200925 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100926 while (1) {
927 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200928 push @dirs, $dir;
929 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100930 };
931 last unless $it->next;
932 };
933
934 print "Start processing ...\n";
935 $t = Benchmark->new;
936 $count = scalar @dirs;
937
938 DIRECTORY_LOOP:
939 for (my $i = 0; $i < $count; $i++) {
940
Akrone1dbc382016-07-08 22:24:52 +0200941 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200942 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200943 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200944 );
Akron941c1a62016-02-23 17:41:41 +0100945
946 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200947 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200948
Akron13d56622016-10-31 14:54:49 +0100949 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200950 $pool->finish(
951 0,
Akronda3097e2017-04-23 19:53:57 +0200952 [
953 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
954 undef,
955 $filename
956 ]
Akron486f9ab2017-04-22 23:25:19 +0200957 );
Akron3ec48972016-08-17 23:24:52 +0200958 }
959 else {
Akron4c0cf312016-10-15 16:42:09 +0200960 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200961 };
Akron941c1a62016-02-23 17:41:41 +0100962 };
963 }
964
965 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200966 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200967
Akron941c1a62016-02-23 17:41:41 +0100968 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200969 $log->error("Unzip is not installed or incompatible.");
970 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100971 };
972
Akron08385f62016-03-22 20:37:04 +0100973 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200974 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100975
Akron31a08cb2019-02-20 20:43:26 +0100976 # Get sigles to extract
977 my $prefix = set_sigle($archive);
978
Akron941c1a62016-02-23 17:41:41 +0100979 print "Start processing ...\n";
980 $t = Benchmark->new;
981 my @dirs = $archive->list_texts;
982 $count = scalar @dirs;
983
984 ARCHIVE_LOOP:
985 for (my $i = 0; $i < $count; $i++) {
986
987 # Split path information
988 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
989
Akrone1dbc382016-07-08 22:24:52 +0200990 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200991 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200992 get_file_name(
993 catfile($corpus, $doc, $text)
994 . '.json' . ($gzip ? '.gz' : '')
995 )
Akrone1dbc382016-07-08 22:24:52 +0200996 );
Akron941c1a62016-02-23 17:41:41 +0100997
998 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200999 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +01001000
Akron4c0cf312016-10-15 16:42:09 +02001001 # Create temporary file
1002 $temp = File::Temp->newdir;
1003
Akronbdf434a2016-10-24 17:42:07 +02001004 # TODO: Check if $filename exist at the beginning,
1005 # because extraction can be horrible slow!
1006
Akron941c1a62016-02-23 17:41:41 +01001007 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +01001008 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +01001009
Akron7d4cdd82016-08-17 21:39:45 +02001010 # Create corpus directory
1011 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +01001012
Akron7d4cdd82016-08-17 21:39:45 +02001013 # Temporary directory
1014 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +01001015
Akron7d4cdd82016-08-17 21:39:45 +02001016 # Write file
Akron13d56622016-10-31 14:54:49 +01001017 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +02001018
Akron4c0cf312016-10-15 16:42:09 +02001019 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001020 $pool->finish(
1021 0,
Akronda3097e2017-04-23 19:53:57 +02001022 [
1023 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1024 $temp,
1025 $filename
1026 ]
Akron13d56622016-10-31 14:54:49 +01001027 );
1028 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001029 }
1030 else {
Akron4c0cf312016-10-15 16:42:09 +02001031 # Delete temporary file
1032 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001033 };
Akron941c1a62016-02-23 17:41:41 +01001034 }
Akron7d4cdd82016-08-17 21:39:45 +02001035
1036 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001037 else {
Akron4c0cf312016-10-15 16:42:09 +02001038 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001039 };
1040 };
1041 }
1042
1043 else {
1044 print "Input is neither a directory nor an archive.\n\n";
1045 };
1046
1047 $pool->wait_all_children;
1048
Akron11c80302016-03-18 19:44:43 +01001049 # Delete cache file
1050 unlink($cache_file) if $cache_delete;
1051
Akronda3097e2017-04-23 19:53:57 +02001052 # Close tar filehandle
1053 if ($to_tar && $tar_fh) {
1054 $tar_archive->finish;
1055 $tar_fh->close;
1056 print "Wrote to tar archive.\n";
1057 };
1058
Akron63f20d42017-04-10 23:40:29 +02001059 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001060 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001061};
Akron941c1a62016-02-23 17:41:41 +01001062
Nils Diewald2db9ad02013-10-29 19:26:43 +00001063
# Prepare the file-level list of sigles (@sigle) for an archive.
#
# - If @sigle is empty, it is filled with one "corpus/doc/text" entry
#   for every text listed by the archive.
# - Otherwise every entry consisting of exactly two path segments
#   (optionally preceded by "./" or ".\") is treated as a document
#   sigle and is extracted to $output immediately; all other entries
#   are kept in @sigle as text sigles.
#
# Parameters:
#   $archive - archive object; list_texts, split_path, check_prefix
#              and extract_sigle are called on it.
#
# Returns the prefix value: the first element returned by split_path
# for the last listed text (or the initial 1 if the archive lists no
# texts) when no sigles were given, otherwise the result of
# check_prefix.
# NOTE(review): the meaning of the prefix value is defined by the
# archive class - confirm there.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    for my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  # The prefix is checked a single time, when the first sigle is visited
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Sigles that are not extracted here
  my @text_sigle;

  # A named lexical loop variable is used instead of $_, so the
  # method calls below cannot clobber the current entry
  for my $current (@sigle) {

    # Sigle is a doc sigle - extract the whole document right away
    if ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$current ...";
      $check_prefix_once->();
      print "\n";

      # Extraction runs with a single job if sequential extraction
      # is requested, otherwise with the configured number of jobs
      print '... ' . (
        $archive->extract_sigle([$current], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
        );
      print "extracted.\n";
    }

    # Sigle is a text sigle - keep it for later processing
    else {
      push @text_sigle, $current;
      $check_prefix_once->();
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
1137
1138
1139
# Clean up: delete the extraction directory (if one was set) and
# report the count returned by remove_tree
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};


# Finish the console output with an empty line
print "\n";
1148
Nils Diewald2db9ad02013-10-29 19:26:43 +00001149__END__
Akron941c1a62016-02-23 17:41:41 +01001150
1151=pod
1152
1153=encoding utf8
1154
1155=head1 NAME
1156
Akronf7ad89e2016-03-16 18:22:47 +01001157korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001158
1159
1160=head1 SYNOPSIS
1161
Akrona76d8352016-10-27 16:27:32 +02001162 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001163
Akron2fd402b2016-10-27 21:26:48 +02001164
Akron941c1a62016-02-23 17:41:41 +01001165=head1 DESCRIPTION
1166
1167L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1168compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001169The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001170
1171
1172=head1 INSTALLATION
1173
1174The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1175
Akronaf386982016-10-12 00:33:25 +02001176 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001177
Akronc13a1702016-03-15 19:33:14 +01001178In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001179be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001180Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akrona93d51b2016-10-24 20:27:48 +02001181In addition to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001182
1183=head1 ARGUMENTS
1184
Akrona76d8352016-10-27 16:27:32 +02001185 $ korapxml2krill -z --input <directory> --output <filename>
1186
1187Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001188It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001189
Akron941c1a62016-02-23 17:41:41 +01001190=over 2
1191
1192=item B<archive>
1193
Akron081639e2017-04-21 19:01:39 +02001194 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001195
Akron2fd402b2016-10-27 21:26:48 +02001196Converts an archive of KorAP-XML documents. It expects a directory
1197(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001198
1199=item B<extract>
1200
Akrona76d8352016-10-27 16:27:32 +02001201 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1202
1203Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001204
Akron63f20d42017-04-10 23:40:29 +02001205=item B<serial>
1206
1207 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1208
1209Convert archives sequentially. The inputs are not merged but treated
1210as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001212are created based on the archive name. In case the C<--to-tar> flag is given,
1213the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001214
1215
Akron941c1a62016-02-23 17:41:41 +01001216=back
1217
1218
1219=head1 OPTIONS
1220
1221=over 2
1222
Akrona76d8352016-10-27 16:27:32 +02001223=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001224
Akrona76d8352016-10-27 16:27:32 +02001225Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001226
Akron7606afa2016-10-25 16:23:49 +02001227Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001228document, while C<archive> expects a KorAP-XML corpus folder or a zip
1229file to batch process multiple files.
1230C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001231
Akrona76d8352016-10-27 16:27:32 +02001232C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001233that the first archive listed contains all primary data files
1234and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001235
Akron7606afa2016-10-25 16:23:49 +02001236 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001237
Akron821db3d2017-04-06 21:19:31 +02001238Input may also be defined using BSD glob wildcards.
1239
1240 -i 'file/news*.zip'
1241
1242The extended input array will be sorted in length order, so the shortest
1243path needs to contain all primary data files and all meta data files.
1244
Akron0c3e3752016-06-28 15:55:53 +02001245(The directory structure follows the base directory format,
1246that may include a C<.> root folder.
1247In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001248need to be passed with a hash sign in front of the archive's name.
1249This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001250
Akron7606afa2016-10-25 16:23:49 +02001251To support zip files, a version of C<unzip> needs to be installed that is
1252compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001253
Akron7606afa2016-10-25 16:23:49 +02001254B<The root folder switch using the hash sign is experimental and
1255may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001256
Akronf73ffb62018-06-27 12:13:59 +02001257
Akron63f20d42017-04-10 23:40:29 +02001258=item B<--input-base|-ib> <directory>
1259
1260The base directory for inputs.
1261
1262
Akron941c1a62016-02-23 17:41:41 +01001263=item B<--output|-o> <directory|file>
1264
1265Output folder for archive processing or
1266document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001267writes to C<STDOUT> by default
1268(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001269
1270=item B<--overwrite|-w>
1271
1272Overwrite files that already exist.
1273
Akronf73ffb62018-06-27 12:13:59 +02001274
Akron3741f8b2016-12-21 19:55:21 +01001275=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001276
1277Define the default tokenization by specifying
1278the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001279of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001280
Akron3741f8b2016-12-21 19:55:21 +01001281
1282=item B<--base-sentences|-bs> <foundry>#<layer>
1283
1284Define the layer for base sentences.
1285If given, this will be used instead of using C<Base#Sentences>.
1286Currently C<DeReKo#Structure> is the only additional layer supported.
1287
1288 Defaults to unset.
1289
1290
1291=item B<--base-paragraphs|-bp> <foundry>#<layer>
1292
1293Define the layer for base paragraphs.
1294If given, this will be used instead of using C<Base#Paragraphs>.
1295Currently C<DeReKo#Structure> is the only additional layer supported.
1296
1297 Defaults to unset.
1298
1299
Akron41ac10b2017-02-08 22:47:25 +01001300=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1301
1302Define the layer for base pagebreaks.
1303Currently C<DeReKo#Structure> is the only layer supported.
1304
1305 Defaults to unset.
1306
1307
Akron941c1a62016-02-23 17:41:41 +01001308=item B<--skip|-s> <foundry>[#<layer>]
1309
Akronf7ad89e2016-03-16 18:22:47 +01001310Skip specific annotations by specifying the foundry
1311(and optionally the layer with a C<#>-prefix),
1312e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001313Can be set multiple times.
1314
Akronf73ffb62018-06-27 12:13:59 +02001315
Akronc13a1702016-03-15 19:33:14 +01001316=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001317
Akronf7ad89e2016-03-16 18:22:47 +01001318Convert specific annotations by specifying the foundry
1319(and optionally the layer with a C<#>-prefix),
1320e.g. C<Mate> or C<Mate#Morpho>.
1321Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akron941c1a62016-02-23 17:41:41 +01001324=item B<--primary|-p>
1325
Akronc13a1702016-03-15 19:33:14 +01001326Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001327Can be flagged using C<--no-primary> as well.
1328This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akroned9baf02019-01-22 17:03:25 +01001331=item B<--non-word-tokens|-nwt>
1332
1333Tokenize non-word tokens like word tokens (defined as matching
1334C</[\d\w]/>). Useful to treat punctuations as tokens.
1335
1336 Defaults to unset.
1337
Akron941c1a62016-02-23 17:41:41 +01001338=item B<--jobs|-j>
1339
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001341for archive processing.
Akron11c80302016-03-18 19:44:43 +01001342Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001343
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1346
Akronc11f7982017-02-21 21:20:14 +01001347Pass -1, and the value will be set automatically to 5
1348times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001349This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001350
Akronf73ffb62018-06-27 12:13:59 +02001351
Akron263274c2019-02-07 09:48:30 +01001352=item B<--koral|-k>
1353
1354Version of the output format. Supported versions are:
1355C<0> for legacy serialization, C<0.03> for serialization
1356with metadata fields as key-values on the root object,
1357C<0.4> for serialization with metadata fields as a list
1358of C<"@type":"koral:field"> objects.
1359
1360Currently defaults to C<0.03>.
1361
1362
Akron9ec88872017-04-12 16:29:06 +02001363=item B<--sequential-extraction|-se>
1364
Flag to indicate that extraction runs sequentially (in a single job) instead of using the C<jobs> value.
1366Some systems may have problems with extracting multiple archives
1367to the same folder at the same time.
1368Can be flagged using C<--no-sequential-extraction> as well.
1369Defaults to C<false>.
1370
Akronf73ffb62018-06-27 12:13:59 +02001371
Akron35db6e32016-03-17 22:42:22 +01001372=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001373
Akron35db6e32016-03-17 22:42:22 +01001374Define the metadata parser to use. Defaults to C<I5>.
1375Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1376This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001377
Akronf73ffb62018-06-27 12:13:59 +02001378
Akron941c1a62016-02-23 17:41:41 +01001379=item B<--pretty|-y>
1380
Akronc13a1702016-03-15 19:33:14 +01001381Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001382This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001383
Akronf73ffb62018-06-27 12:13:59 +02001384
Akron941c1a62016-02-23 17:41:41 +01001385=item B<--gzip|-z>
1386
Akronf7ad89e2016-03-16 18:22:47 +01001387Compress the output.
1388Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001389
Akronf73ffb62018-06-27 12:13:59 +02001390
Akron11c80302016-03-18 19:44:43 +01001391=item B<--cache|-c>
1392
1393File to mmap a cache (using L<Cache::FastMmap>).
1394Defaults to C<korapxml2krill.cache> in the calling directory.
1395
Akronf73ffb62018-06-27 12:13:59 +02001396
Akron11c80302016-03-18 19:44:43 +01001397=item B<--cache-size|-cs>
1398
1399Size of the cache. Defaults to C<50m>.
1400
Akronf73ffb62018-06-27 12:13:59 +02001401
Akron11c80302016-03-18 19:44:43 +01001402=item B<--cache-init|-ci>
1403
1404Initialize cache file.
1405Can be flagged using C<--no-cache-init> as well.
1406Defaults to C<true>.
1407
Akronf73ffb62018-06-27 12:13:59 +02001408
Akron11c80302016-03-18 19:44:43 +01001409=item B<--cache-delete|-cd>
1410
1411Delete cache file after processing.
1412Can be flagged using C<--no-cache-delete> as well.
1413Defaults to C<true>.
1414
Akronf73ffb62018-06-27 12:13:59 +02001415
Akron636aa112017-04-07 18:48:56 +02001416=item B<--config|-cfg>
1417
1418Configure the parameters of your call in a file
of key-value pairs separated by whitespace:
1420
1421 overwrite 1
1422 token DeReKo#Structure
1423 ...
1424
1425Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001426C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001427C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001428C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001430C<base-sentences>, C<base-paragraphs>,
1431C<base-pagebreaks>,
1432C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001433(semicolon separated), C<anno> (semicolon separated).
1434
Akronf73ffb62018-06-27 12:13:59 +02001435Configuration parameters will always be overwritten by
1436passed parameters.
1437
1438
Akron81500102017-04-07 20:45:44 +02001439=item B<--temporary-extract|-te>
1440
1441Only valid for the C<archive> command.
1442
1443This will first extract all files into a
1444directory and then will archive.
1445If the directory is given as C<:temp:>,
1446a temporary directory is used.
1447This is especially useful to avoid
1448massive unzipping and potential
1449network latency.
Akron636aa112017-04-07 18:48:56 +02001450
Akronf73ffb62018-06-27 12:13:59 +02001451
Akrone10ad322016-02-27 10:54:26 +01001452=item B<--sigle|-sg>
1453
Akron20807582016-10-26 17:11:34 +02001454Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001455Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001456I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001457Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001458In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001459On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001460
Akronf73ffb62018-06-27 12:13:59 +02001461
Akron941c1a62016-02-23 17:41:41 +01001462=item B<--log|-l>
1463
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1465
Akronf73ffb62018-06-27 12:13:59 +02001466
Akron941c1a62016-02-23 17:41:41 +01001467=item B<--help|-h>
1468
1469Print this document.
1470
Akronf73ffb62018-06-27 12:13:59 +02001471
Akron941c1a62016-02-23 17:41:41 +01001472=item B<--version|-v>
1473
1474Print version information.
1475
1476=back
1477
Akronf73ffb62018-06-27 12:13:59 +02001478
Akronc13a1702016-03-15 19:33:14 +01001479=head1 ANNOTATION SUPPORT
1480
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1485
Akron821db3d2017-04-06 21:19:31 +02001486 Base
1487 #Paragraphs
1488 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001489
Akron821db3d2017-04-06 21:19:31 +02001490 Connexor
1491 #Morpho
1492 #Phrase
1493 #Sentences
1494 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001495
Akron821db3d2017-04-06 21:19:31 +02001496 CoreNLP
1497 #Constituency
1498 #Morpho
1499 #NamedEntities
1500 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001501
Akronce125b62017-06-19 11:54:36 +02001502 CMC
1503 #Morpho
1504
Akron821db3d2017-04-06 21:19:31 +02001505 DeReKo
1506 #Structure
Akronc13a1702016-03-15 19:33:14 +01001507
Akron821db3d2017-04-06 21:19:31 +02001508 DRuKoLa
1509 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001510
Akron821db3d2017-04-06 21:19:31 +02001511 Glemm
1512 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001513
Akronea1aed52018-07-19 14:43:34 +02001514 HNC
1515 #Morpho
1516
Akron4c679192018-01-16 17:41:49 +01001517 LWC
1518 #Dependency
1519
Akron821db3d2017-04-06 21:19:31 +02001520 Malt
1521 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001522
Akron821db3d2017-04-06 21:19:31 +02001523 MarMoT
1524 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001525
Akron821db3d2017-04-06 21:19:31 +02001526 Mate
1527 #Dependency
1528 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001529
Akron821db3d2017-04-06 21:19:31 +02001530 MDParser
1531 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001532
Akron821db3d2017-04-06 21:19:31 +02001533 OpenNLP
1534 #Morpho
1535 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001536
Akron821db3d2017-04-06 21:19:31 +02001537 Sgbr
1538 #Lemma
1539 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001540
Akron821db3d2017-04-06 21:19:31 +02001541 TreeTagger
1542 #Morpho
1543 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001544
Akron821db3d2017-04-06 21:19:31 +02001545 XIP
1546 #Constituency
1547 #Morpho
1548 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001549
Akronc13a1702016-03-15 19:33:14 +01001550
1551More importers are in preparation.
1552New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1553See the built-in annotation importers as examples.
1554
Akronf73ffb62018-06-27 12:13:59 +02001555
Akron941c1a62016-02-23 17:41:41 +01001556=head1 AVAILABILITY
1557
1558 https://github.com/KorAP/KorAP-XML-Krill
1559
1560
1561=head1 COPYRIGHT AND LICENSE
1562
Akroned9baf02019-01-22 17:03:25 +01001563Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001564
Akron941c1a62016-02-23 17:41:41 +01001565Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001566
Akrona76d8352016-10-27 16:27:32 +02001567Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001568
1569L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1570Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001571L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001572member of the
1573L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1574
1575This program is free software published under the
1576L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1577
1578=cut