blob: d53004b5042c28a29a29fba9b865d050562719ff [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron941c1a62016-02-23 17:41:41 +0100141# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100142
Akron263274c2019-02-07 09:48:30 +0100143our $LAST_CHANGE = '2019/02/07';
Akron941c1a62016-02-23 17:41:41 +0100144our $LOCAL = $FindBin::Bin;
Akron263274c2019-02-07 09:48:30 +0100145our $KORAL_VERSION = 0.03;
Akron941c1a62016-02-23 17:41:41 +0100146our $VERSION_MSG = <<"VERSION";
147Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
148VERSION
149
Akron63f20d42017-04-10 23:40:29 +0200150# Prototypes
151sub get_file_name_from_glob($);
152sub get_file_name($);
153
# Parse command
our @ARGV;

# A leading argument that does not start with a dash is
# treated as the command (e.g. 'archive', 'extract' or 'serial')
my $cmd = ($ARGV[0] && index($ARGV[0], '-') != 0) ? shift(@ARGV) : undef;

# Remember the remaining arguments, so they can be passed on
# to child invocations of this script
my @keep_argv = @ARGV;

my (@skip, @sigle, @anno, @input);
my $text;
Akrone10ad322016-02-27 10:54:26 +0100164
# Parse options from the command line

# Scalar option targets
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Print the selected usage sections together with the version message
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print the version message with minimal usage information
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);
210
Akron63f20d42017-04-10 23:40:29 +0200211
# Load from configuration
# Values given on the command line always take precedence;
# the configuration file only fills in what is still unset.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Configuration key => scalar option it may fill
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],             # Koral version
    [ 'input-base'            => \$input_base ],        # Input root base directory
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],        # Token base
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],            # Write to tar
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  }

  # Configuration key => list option it may fill.
  # List values are separated by semicolons in the configuration.
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;
    next if scalar(@$target);
    next unless defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  }
}


# Set defaults for everything neither given on the
# command line nor in the configuration
$token_base            //= 'OpenNLP#tokens';
$cache_file            //= 'korapxml2krill.cache';
$cache_size            //= '50m';
$jobs                  //= 0;
$koral                 //= $KORAL_VERSION;
$cache_delete          //= 1;
$cache_init            //= 1;
$sequential_extraction //= 0;
$log_level             //= 'ERROR';
$base_sentences        //= '';
$base_paragraphs       //= '';
$base_pagebreaks       //= '';
$non_word_tokens       //= 0;

# Base annotations are compared in lower case
$_ = lc($_) for ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100353
Akron63f20d42017-04-10 23:40:29 +0200354
# Initialize log4perl object
# All messages at or above the chosen level are written to STDERR
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


# Report the configuration file - but only claim to read it in case
# it exists, as a missing file is skipped when the configuration is loaded
if ($cfg_file) {
  if (-e $cfg_file) {
    print "Reading config from $cfg_file\n";
  }
  else {
    $log->warn("Configuration file '$cfg_file' does not exist and is ignored");
  };
};
367
368
# Parameters for pod2usage in case of invalid arguments:
# print the usage sections and leave with exit code 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
if (!@input) {
  pod2usage(%ERROR_HASH);
}

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
}


# A job count of -1 requests automatic detection:
# five jobs are started per available CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
}
389
Akron821db3d2017-04-06 21:19:31 +0200390
# Start serial processing:
# Every input is converted by a separate run of this script
# using the 'archive' command, one after the other.
if ($cmd && $cmd eq 'serial') {

  # Output is required, as every input is written to its own
  # location below it. Without this check an undefined output would
  # make catdir() build a path below the file system root.
  pod2usage(%ERROR_HASH) unless $output;

  if ((!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments passed on,
  # as they are set individually for each child call.
  # Both notations are respected: '--input value' and '--input=value'.
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Input or output flag, possibly with an attached value
    if (/\A--?(?:input|i|output|o)(=.*)?\z/s) {

      # The value only follows as the next argument,
      # if it is not attached to the flag
      $remove_next = defined($1) ? 0 : 1;
      $keep = 0;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;


  # Iterate over all inputs
  foreach my $archive_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $archive_input, '-o', $new_out
    );
    print "Start serial processing of $archive_input to $new_out\n";

    # Start archiving - a failure is reported, but does not
    # stop the processing of the remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Processing of $archive_input failed: " .
          ($? == -1 ? $! : 'exit status ' . ($? >> 8))
        );
    };
  };

  exit;
};
445
# Foundries and layers to skip, compared in lower case
my %skip = map { lc($_) => 1 } @skip;

# DeReKo structure may serve as the base for sentences,
# paragraphs and pagebreaks
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotation layers as [Foundry, Layer, (Parameter)],
# in the order they are added
my @layers = (

  # Base - only in case no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - with the requested base annotations as a parameter
  (
    @dereko_attr ?
      ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)] :
      ['DeReKo', 'Structure']
    ),

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
530
Akron4fa37c32017-01-20 14:43:10 +0100531
# Check filters
my @filtered_anno;

# Everything is skipped - only add the annotations requested explicitly
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Skip if Foundry or Foundry#Layer should be skipped
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};

# Get tokenization basis
# The variables are declared unconditionally, as the behaviour of
# 'my' combined with a statement modifier is undefined
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200556
# Meta data cache based on a memory mapped file
# TODO: This should not be initialized for batch
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object that converts single texts
# based on the chosen tokenization and annotations
my @batch_param = (
  cache           => $cache,
  meta_type       => $meta,
  overwrite       => $overwrite,
  foundry         => $token_base_foundry,
  layer           => $token_base_layer,
  gzip            => $gzip,
  log             => $log,
  koral           => $koral,
  primary         => $primary,
  pretty          => $pretty,
  anno            => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
579
# Get file name based on path information
#
# Expects the path of a text and returns a name with all path
# separators replaced by dashes. Temporary directory fragments and
# the path of the first input are removed from the name.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input
  my $base = $input[0] // '';

  # For directories, strip the last path segment
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the input path with an optional
  # preceding slash. The path is quoted, as it is a literal string and
  # may contain characters with a special meaning in patterns.
  $file =~ s{/?\Q$base\E}{};

  # Turn path separators into dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Strip the first segment, if at least three further segments follow
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
596
Akron63f20d42017-04-10 23:40:29 +0200597
# Create a file name based on an input path, that may contain wildcards
#
# Expects a path or glob pattern and returns a name, in which
# path separators and glob brackets are replaced by dashes,
# wildcards are dropped and a trailing '.zip' is removed.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;        # Transform paths
  $glob =~ s/[\*\?]//g;         # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;    # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;         # Remove sequences of binding characters
  $glob =~ s/^-//;              # Clean beginning
  $glob =~ s/-$//;              # Clean end
  $glob =~ s/\.zip$//;          # Remove file extension

  # Clean end again, as the extension may have followed a closing
  # bracket (e.g. 'wpd[12].zip'), which was turned into a dash
  $glob =~ s/-$//;
  return $glob;
};
609
610
# Convert sigle to path construct,
# e.g. a sigle of the form A_B.C becomes A/B/C
for my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}

# For all commands the output has to be an existing directory,
# unless the output is written to a tar file
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
}
620
Akron63f20d42017-04-10 23:40:29 +0200621
# Glob and prefix files
if (@input) {

  # Prefix every input with the input root (if given)
  # and expand all wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
}
641
642
# Process a single file
# This is the default, in case no command is given
unless ($cmd) {

  # Take the start times as early as possible
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info(
      sprintf('The code took: %s (overall: %s)', $since_last_stop, $overall)
    );
    $main::LAST_STOP = $now;
  };

  # The path of the document requires a trailing slash
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Create, parse and process the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
674
Nils Diewald59094f22014-11-05 18:20:50 +0000675
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the primary input has to be a file
  my $archive = -f($input[0]) && KorAP::XML::Archive->new($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  # NOTE(review): set_sigle is not defined in this part of the file -
  # its return value is used as a flag for prefixing paths with './'
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    my $extracted = $archive->extract_text(
      ($prefix ? './' : '') . $sigle, $output
    );

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
786
Akron81500102017-04-07 20:45:44 +0200787
Akron941c1a62016-02-23 17:41:41 +0100788# Process an archive
789elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000790
Akron81500102017-04-07 20:45:44 +0200791 my $archive_output;
792
793 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100794 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200795
796 # Create new archive object
797 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
798
799 # Check zip capabilities
800 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200801 $log->error("Unzip is not installed or incompatible.");
802 exit 1;
Akron81500102017-04-07 20:45:44 +0200803 };
804
805 # Add further annotation archived
806 $archive->attach($_) foreach @input[1..$#input];
807
808 # Create a temporary directory
809 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200810 $extract_dir = tempdir(CLEANUP => 0);
811 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200812 };
813
Akron63f20d42017-04-10 23:40:29 +0200814 # Add some random extra to avoid clashes with multiple archives
815 $extract_dir = catdir($extract_dir, random_string('cccccc'));
816
Akron31a08cb2019-02-20 20:43:26 +0100817 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200818 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200819 @input = ($extract_dir);
820 }
821 else {
822 $log->error('Unable to extract from primary archive ' . $input[0] .
823 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200824 exit 1;
Akron81500102017-04-07 20:45:44 +0200825 };
826 }
827
828 # Can't create archive object
829 else {
830 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200831 exit 1;
Akron81500102017-04-07 20:45:44 +0200832 };
833 };
834
Akron7d4cdd82016-08-17 21:39:45 +0200835 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100836 my $pool = Parallel::ForkManager->new($jobs);
837
Akron7d4cdd82016-08-17 21:39:45 +0200838 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100839 my $iter = 1; # Current text in process
840
Akronda3097e2017-04-23 19:53:57 +0200841 my $tar_archive;
842 my $output_dir = $output;
843 my $tar_fh;
844
845 # Initialize tar archive
846 if ($to_tar) {
847 $tar_archive = Archive::Tar::Builder->new(
848 ignore_errors => 1
849 );
850
851 # Set output name
852 my $tar_file = $output;
853 unless ($tar_file =~ /\.tar$/) {
854 $tar_file .= '.tar';
855 };
856
857 # Initiate the tar file
858 print "Writing to file $tar_file\n";
859 $tar_fh = IO::File->new($tar_file, 'w');
860 $tar_fh->binmode(1);
861
862 # Set handle
863 $tar_archive->set_handle($tar_fh);
864
865 # Output to temporary directory
866 $output_dir = File::Temp->newdir;
867 };
868
Akron941c1a62016-02-23 17:41:41 +0100869 # Report on fork message
870 $pool->run_on_finish (
871 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200872 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100873 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200874
Akron08385f62016-03-22 20:37:04 +0100875 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200876 ($iter++) . "/$count]" .
877 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200878 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200879
880 if (!$code && $to_tar && $data->[2]) {
881 my $filename = $data->[2];
882
883 # Lock filehandle
884 if (flock($tar_fh, LOCK_EX)) {
885
Akron9a062ce2017-07-04 19:12:05 +0200886 my $clean_file = fileparse($filename);
887
Akronda3097e2017-04-23 19:53:57 +0200888 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200889 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200890 unlink $filename;
891
892 # Unlock filehandle
893 flock($tar_fh, LOCK_UN);
894 }
895 else {
896 $log->warn("Unable to add $filename to archive");
897 };
898 };
899
Akron4c0cf312016-10-15 16:42:09 +0200900 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100901 }
902 );
903
904 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200905 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100906 print "Reading data ...\n";
907
Akron7d4cdd82016-08-17 21:39:45 +0200908 # unless (Cache::FastMmap->new(
909 # share_file => $cache_file,
910 # cache_size => $cache_size,
911 # init_file => $cache_init
912 # )) {
913 # print "Unable to intialize cache '$cache_file'\n\n";
914 # exit(1);
915 # };
Akron11c80302016-03-18 19:44:43 +0100916
Akron486f9ab2017-04-22 23:25:19 +0200917
Akron941c1a62016-02-23 17:41:41 +0100918 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100919 if (-d $input[0]) {
920 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100921 my @dirs;
922 my $dir;
923
Akron7d4cdd82016-08-17 21:39:45 +0200924 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100925 while (1) {
926 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200927 push @dirs, $dir;
928 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100929 };
930 last unless $it->next;
931 };
932
933 print "Start processing ...\n";
934 $t = Benchmark->new;
935 $count = scalar @dirs;
936
937 DIRECTORY_LOOP:
938 for (my $i = 0; $i < $count; $i++) {
939
Akrone1dbc382016-07-08 22:24:52 +0200940 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200941 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200942 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200943 );
Akron941c1a62016-02-23 17:41:41 +0100944
945 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200946 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200947
Akron13d56622016-10-31 14:54:49 +0100948 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200949 $pool->finish(
950 0,
Akronda3097e2017-04-23 19:53:57 +0200951 [
952 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
953 undef,
954 $filename
955 ]
Akron486f9ab2017-04-22 23:25:19 +0200956 );
Akron3ec48972016-08-17 23:24:52 +0200957 }
958 else {
Akron4c0cf312016-10-15 16:42:09 +0200959 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200960 };
Akron941c1a62016-02-23 17:41:41 +0100961 };
962 }
963
964 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200965 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200966
Akron941c1a62016-02-23 17:41:41 +0100967 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200968 $log->error("Unzip is not installed or incompatible.");
969 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100970 };
971
Akron08385f62016-03-22 20:37:04 +0100972 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200973 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100974
Akron31a08cb2019-02-20 20:43:26 +0100975 # Get sigles to extract
976 my $prefix = set_sigle($archive);
977
Akron941c1a62016-02-23 17:41:41 +0100978 print "Start processing ...\n";
979 $t = Benchmark->new;
980 my @dirs = $archive->list_texts;
981 $count = scalar @dirs;
982
983 ARCHIVE_LOOP:
984 for (my $i = 0; $i < $count; $i++) {
985
986 # Split path information
987 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
988
Akrone1dbc382016-07-08 22:24:52 +0200989 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200990 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200991 get_file_name(
992 catfile($corpus, $doc, $text)
993 . '.json' . ($gzip ? '.gz' : '')
994 )
Akrone1dbc382016-07-08 22:24:52 +0200995 );
Akron941c1a62016-02-23 17:41:41 +0100996
997 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200998 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100999
Akron4c0cf312016-10-15 16:42:09 +02001000 # Create temporary file
1001 $temp = File::Temp->newdir;
1002
Akronbdf434a2016-10-24 17:42:07 +02001003 # TODO: Check if $filename exist at the beginning,
1004 # because extraction can be horrible slow!
1005
Akron941c1a62016-02-23 17:41:41 +01001006 # Extract from archive
Akron20807582016-10-26 17:11:34 +02001007 if ($archive->extract_text($dirs[$i], $temp)) {
Akron941c1a62016-02-23 17:41:41 +01001008
Akron7d4cdd82016-08-17 21:39:45 +02001009 # Create corpus directory
1010 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +01001011
Akron7d4cdd82016-08-17 21:39:45 +02001012 # Temporary directory
1013 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +01001014
Akron7d4cdd82016-08-17 21:39:45 +02001015 # Write file
Akron13d56622016-10-31 14:54:49 +01001016 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +02001017
Akron4c0cf312016-10-15 16:42:09 +02001018 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001019 $pool->finish(
1020 0,
Akronda3097e2017-04-23 19:53:57 +02001021 [
1022 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1023 $temp,
1024 $filename
1025 ]
Akron13d56622016-10-31 14:54:49 +01001026 );
1027 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001028 }
1029 else {
Akron4c0cf312016-10-15 16:42:09 +02001030 # Delete temporary file
1031 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001032 };
Akron941c1a62016-02-23 17:41:41 +01001033 }
Akron7d4cdd82016-08-17 21:39:45 +02001034
1035 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001036 else {
Akron4c0cf312016-10-15 16:42:09 +02001037 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001038 };
1039 };
1040 }
1041
1042 else {
1043 print "Input is neither a directory nor an archive.\n\n";
1044 };
1045
1046 $pool->wait_all_children;
1047
Akron11c80302016-03-18 19:44:43 +01001048 # Delete cache file
1049 unlink($cache_file) if $cache_delete;
1050
Akronda3097e2017-04-23 19:53:57 +02001051 # Close tar filehandle
1052 if ($to_tar && $tar_fh) {
1053 $tar_archive->finish;
1054 $tar_fh->close;
1055 print "Wrote to tar archive.\n";
1056 };
1057
Akron63f20d42017-04-10 23:40:29 +02001058 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001059 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001060};
Akron941c1a62016-02-23 17:41:41 +01001061
Nils Diewald2db9ad02013-10-29 19:26:43 +00001062
# Prepare the list of sigles to process for an archive and
# determine whether the archive paths carry a root prefix.
#
# Parameters:
#   $archive - archive object providing list_texts, split_path,
#              check_prefix and extract_doc (the visible caller
#              passes a KorAP::XML::Archive).
#
# Side effects on file-level state:
#   - If @sigle is empty, it is filled with one "corpus/doc/text"
#     entry for every text listed by the archive.
#   - Otherwise every document sigle ("corpus/doc", optionally
#     preceded by "./" or ".\") is extracted to $output right away,
#     and @sigle is reduced to the remaining (text) sigles.
#
# Returns the prefix flag: the first value returned by the last
# split_path call, or the result of check_prefix; the initial
# value 1 is returned if neither was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given: register every text listed by the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  my @text_sigle;
  my $prefix_checked = 0;

  # Ask the archive for the prefix only once, on the first sigle
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Check sigle for doc sigles
  foreach my $current (@sigle) {

    # Sigle is a text sigle: keep it for later processing
    unless ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @text_sigle, $current;
      $check_prefix_once->();
      next;
    };

    # Sigle is a doc sigle: extract the whole document right away
    print "$current ...";
    $check_prefix_once->();
    print "\n";

    # TODO: Make this OS independent
    my $path = ($prefix ? './' : '') . $current;

    print '... ' . (
      $archive->extract_doc(
        $path, $output, $sequential_extraction ? 1 : $jobs
      ) ? '' : 'not '
    );
    print "extracted.\n";
  };

  # Only text sigles remain to be processed by the caller
  @sigle = @text_sigle;

  return $prefix;
};
1140
1141
1142
# Remove the extraction directory again, in case one is set,
# and report how many objects were deleted
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir .
    ' with ' . $removed . " objects.\n";
};


print "\n";
1151
Nils Diewald2db9ad02013-10-29 19:26:43 +00001152__END__
Akron941c1a62016-02-23 17:41:41 +01001153
1154=pod
1155
1156=encoding utf8
1157
1158=head1 NAME
1159
Akronf7ad89e2016-03-16 18:22:47 +01001160korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001161
1162
1163=head1 SYNOPSIS
1164
Akrona76d8352016-10-27 16:27:32 +02001165 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001166
Akron2fd402b2016-10-27 21:26:48 +02001167
Akron941c1a62016-02-23 17:41:41 +01001168=head1 DESCRIPTION
1169
1170L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1171compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001172The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001173
1174
1175=head1 INSTALLATION
1176
1177The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1178
Akronaf386982016-10-12 00:33:25 +02001179 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001180
Akronc13a1702016-03-15 19:33:14 +01001181In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001182be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001183Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001185
1186=head1 ARGUMENTS
1187
Akrona76d8352016-10-27 16:27:32 +02001188 $ korapxml2krill -z --input <directory> --output <filename>
1189
1190Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001191It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001192
Akron941c1a62016-02-23 17:41:41 +01001193=over 2
1194
1195=item B<archive>
1196
Akron081639e2017-04-21 19:01:39 +02001197 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001198
Akron2fd402b2016-10-27 21:26:48 +02001199Converts an archive of KorAP-XML documents. It expects a directory
1200(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001201
1202=item B<extract>
1203
Akrona76d8352016-10-27 16:27:32 +02001204 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1205
1206Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001207
Akron63f20d42017-04-10 23:40:29 +02001208=item B<serial>
1209
1210 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1211
1212Convert archives sequentially. The inputs are not merged but treated
1213as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001215are created based on the archive name. In case the C<--to-tar> flag is given,
1216the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001217
1218
Akron941c1a62016-02-23 17:41:41 +01001219=back
1220
1221
1222=head1 OPTIONS
1223
1224=over 2
1225
Akrona76d8352016-10-27 16:27:32 +02001226=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001227
Akrona76d8352016-10-27 16:27:32 +02001228Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001229
Akron7606afa2016-10-25 16:23:49 +02001230Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001231document, while C<archive> expects a KorAP-XML corpus folder or a zip
1232file to batch process multiple files.
1233C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001234
Akrona76d8352016-10-27 16:27:32 +02001235C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001236that the first archive listed contains all primary data files
1237and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001238
Akron7606afa2016-10-25 16:23:49 +02001239 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001240
Akron821db3d2017-04-06 21:19:31 +02001241Input may also be defined using BSD glob wildcards.
1242
1243 -i 'file/news*.zip'
1244
1245The extended input array will be sorted in length order, so the shortest
1246path needs to contain all primary data files and all meta data files.
1247
Akron0c3e3752016-06-28 15:55:53 +02001248(The directory structure follows the base directory format,
1249that may include a C<.> root folder.
1250In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001251need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001253
Akron7606afa2016-10-25 16:23:49 +02001254To support zip files, a version of C<unzip> needs to be installed that is
1255compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001256
Akron7606afa2016-10-25 16:23:49 +02001257B<The root folder switch using the hash sign is experimental and
1258may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001259
Akronf73ffb62018-06-27 12:13:59 +02001260
Akron63f20d42017-04-10 23:40:29 +02001261=item B<--input-base|-ib> <directory>
1262
1263The base directory for inputs.
1264
1265
Akron941c1a62016-02-23 17:41:41 +01001266=item B<--output|-o> <directory|file>
1267
1268Output folder for archive processing or
1269document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001270writes to C<STDOUT> by default
1271(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001272
1273=item B<--overwrite|-w>
1274
1275Overwrite files that already exist.
1276
Akronf73ffb62018-06-27 12:13:59 +02001277
Akron3741f8b2016-12-21 19:55:21 +01001278=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001279
1280Define the default tokenization by specifying
1281the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001282of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001283
Akron3741f8b2016-12-21 19:55:21 +01001284
1285=item B<--base-sentences|-bs> <foundry>#<layer>
1286
1287Define the layer for base sentences.
1288If given, this will be used instead of using C<Base#Sentences>.
1289Currently C<DeReKo#Structure> is the only additional layer supported.
1290
1291 Defaults to unset.
1292
1293
1294=item B<--base-paragraphs|-bp> <foundry>#<layer>
1295
1296Define the layer for base paragraphs.
1297If given, this will be used instead of using C<Base#Paragraphs>.
1298Currently C<DeReKo#Structure> is the only additional layer supported.
1299
1300 Defaults to unset.
1301
1302
Akron41ac10b2017-02-08 22:47:25 +01001303=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1304
1305Define the layer for base pagebreaks.
1306Currently C<DeReKo#Structure> is the only layer supported.
1307
1308 Defaults to unset.
1309
1310
Akron941c1a62016-02-23 17:41:41 +01001311=item B<--skip|-s> <foundry>[#<layer>]
1312
Akronf7ad89e2016-03-16 18:22:47 +01001313Skip specific annotations by specifying the foundry
1314(and optionally the layer with a C<#>-prefix),
1315e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001316Can be set multiple times.
1317
Akronf73ffb62018-06-27 12:13:59 +02001318
Akronc13a1702016-03-15 19:33:14 +01001319=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001320
Akronf7ad89e2016-03-16 18:22:47 +01001321Convert specific annotations by specifying the foundry
1322(and optionally the layer with a C<#>-prefix),
1323e.g. C<Mate> or C<Mate#Morpho>.
1324Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001325
Akronf73ffb62018-06-27 12:13:59 +02001326
Akron941c1a62016-02-23 17:41:41 +01001327=item B<--primary|-p>
1328
Akronc13a1702016-03-15 19:33:14 +01001329Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001330Can be flagged using C<--no-primary> as well.
1331This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001332
Akronf73ffb62018-06-27 12:13:59 +02001333
Akroned9baf02019-01-22 17:03:25 +01001334=item B<--non-word-tokens|-nwt>
1335
1336Tokenize non-word tokens like word tokens (defined as matching
1337C</[\d\w]/>). Useful to treat punctuations as tokens.
1338
1339 Defaults to unset.
1340
Akron941c1a62016-02-23 17:41:41 +01001341=item B<--jobs|-j>
1342
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001344for archive processing.
Akron11c80302016-03-18 19:44:43 +01001345Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001346
If C<sequential-extraction> is not set to true, this will
1348also apply to extraction.
1349
Akronc11f7982017-02-21 21:20:14 +01001350Pass -1, and the value will be set automatically to 5
1351times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001352This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001353
Akronf73ffb62018-06-27 12:13:59 +02001354
Akron263274c2019-02-07 09:48:30 +01001355=item B<--koral|-k>
1356
1357Version of the output format. Supported versions are:
1358C<0> for legacy serialization, C<0.03> for serialization
1359with metadata fields as key-values on the root object,
1360C<0.4> for serialization with metadata fields as a list
1361of C<"@type":"koral:field"> objects.
1362
1363Currently defaults to C<0.03>.
1364
1365
Akron9ec88872017-04-12 16:29:06 +02001366=item B<--sequential-extraction|-se>
1367
Flag to indicate that archives are extracted sequentially,
so that the C<jobs> value does not apply to extraction.
1369Some systems may have problems with extracting multiple archives
1370to the same folder at the same time.
1371Can be flagged using C<--no-sequential-extraction> as well.
1372Defaults to C<false>.
1373
Akronf73ffb62018-06-27 12:13:59 +02001374
Akron35db6e32016-03-17 22:42:22 +01001375=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001376
Akron35db6e32016-03-17 22:42:22 +01001377Define the metadata parser to use. Defaults to C<I5>.
1378Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1379This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001380
Akronf73ffb62018-06-27 12:13:59 +02001381
Akron941c1a62016-02-23 17:41:41 +01001382=item B<--pretty|-y>
1383
Akronc13a1702016-03-15 19:33:14 +01001384Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001385This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001386
Akronf73ffb62018-06-27 12:13:59 +02001387
Akron941c1a62016-02-23 17:41:41 +01001388=item B<--gzip|-z>
1389
Akronf7ad89e2016-03-16 18:22:47 +01001390Compress the output.
1391Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001392
Akronf73ffb62018-06-27 12:13:59 +02001393
Akron11c80302016-03-18 19:44:43 +01001394=item B<--cache|-c>
1395
1396File to mmap a cache (using L<Cache::FastMmap>).
1397Defaults to C<korapxml2krill.cache> in the calling directory.
1398
Akronf73ffb62018-06-27 12:13:59 +02001399
Akron11c80302016-03-18 19:44:43 +01001400=item B<--cache-size|-cs>
1401
1402Size of the cache. Defaults to C<50m>.
1403
Akronf73ffb62018-06-27 12:13:59 +02001404
Akron11c80302016-03-18 19:44:43 +01001405=item B<--cache-init|-ci>
1406
1407Initialize cache file.
1408Can be flagged using C<--no-cache-init> as well.
1409Defaults to C<true>.
1410
Akronf73ffb62018-06-27 12:13:59 +02001411
Akron11c80302016-03-18 19:44:43 +01001412=item B<--cache-delete|-cd>
1413
1414Delete cache file after processing.
1415Can be flagged using C<--no-cache-delete> as well.
1416Defaults to C<true>.
1417
Akronf73ffb62018-06-27 12:13:59 +02001418
Akron636aa112017-04-07 18:48:56 +02001419=item B<--config|-cfg>
1420
1421Configure the parameters of your call in a file
of key-value pairs with whitespace separator:
1423
1424 overwrite 1
1425 token DeReKo#Structure
1426 ...
1427
1428Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001429C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001430C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001431C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001433C<base-sentences>, C<base-paragraphs>,
1434C<base-pagebreaks>,
1435C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001436(semicolon separated), C<anno> (semicolon separated).
1437
Akronf73ffb62018-06-27 12:13:59 +02001438Configuration parameters will always be overwritten by
1439passed parameters.
1440
1441
Akron81500102017-04-07 20:45:44 +02001442=item B<--temporary-extract|-te>
1443
1444Only valid for the C<archive> command.
1445
1446This will first extract all files into a
1447directory and then will archive.
1448If the directory is given as C<:temp:>,
1449a temporary directory is used.
1450This is especially useful to avoid
1451massive unzipping and potential
1452network latency.
Akron636aa112017-04-07 18:48:56 +02001453
Akronf73ffb62018-06-27 12:13:59 +02001454
Akrone10ad322016-02-27 10:54:26 +01001455=item B<--sigle|-sg>
1456
Akron20807582016-10-26 17:11:34 +02001457Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001458Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001459I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001460Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001461In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001462On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001463
Akronf73ffb62018-06-27 12:13:59 +02001464
Akron941c1a62016-02-23 17:41:41 +01001465=item B<--log|-l>
1466
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1468
Akronf73ffb62018-06-27 12:13:59 +02001469
Akron941c1a62016-02-23 17:41:41 +01001470=item B<--help|-h>
1471
1472Print this document.
1473
Akronf73ffb62018-06-27 12:13:59 +02001474
Akron941c1a62016-02-23 17:41:41 +01001475=item B<--version|-v>
1476
1477Print version information.
1478
1479=back
1480
Akronf73ffb62018-06-27 12:13:59 +02001481
Akronc13a1702016-03-15 19:33:14 +01001482=head1 ANNOTATION SUPPORT
1483
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1485developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1486The base foundry with paragraphs, sentences, and the text element are mandatory for
1487L<Krill|https://github.com/KorAP/Krill>.
1488
Akron821db3d2017-04-06 21:19:31 +02001489 Base
1490 #Paragraphs
1491 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001492
Akron821db3d2017-04-06 21:19:31 +02001493 Connexor
1494 #Morpho
1495 #Phrase
1496 #Sentences
1497 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001498
Akron821db3d2017-04-06 21:19:31 +02001499 CoreNLP
1500 #Constituency
1501 #Morpho
1502 #NamedEntities
1503 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001504
Akronce125b62017-06-19 11:54:36 +02001505 CMC
1506 #Morpho
1507
Akron821db3d2017-04-06 21:19:31 +02001508 DeReKo
1509 #Structure
Akronc13a1702016-03-15 19:33:14 +01001510
Akron821db3d2017-04-06 21:19:31 +02001511 DRuKoLa
1512 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001513
Akron821db3d2017-04-06 21:19:31 +02001514 Glemm
1515 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001516
Akronea1aed52018-07-19 14:43:34 +02001517 HNC
1518 #Morpho
1519
Akron4c679192018-01-16 17:41:49 +01001520 LWC
1521 #Dependency
1522
Akron821db3d2017-04-06 21:19:31 +02001523 Malt
1524 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001525
Akron821db3d2017-04-06 21:19:31 +02001526 MarMoT
1527 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001528
Akron821db3d2017-04-06 21:19:31 +02001529 Mate
1530 #Dependency
1531 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001532
Akron821db3d2017-04-06 21:19:31 +02001533 MDParser
1534 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001535
Akron821db3d2017-04-06 21:19:31 +02001536 OpenNLP
1537 #Morpho
1538 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001539
Akron821db3d2017-04-06 21:19:31 +02001540 Sgbr
1541 #Lemma
1542 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001543
Akron821db3d2017-04-06 21:19:31 +02001544 TreeTagger
1545 #Morpho
1546 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001547
Akron821db3d2017-04-06 21:19:31 +02001548 XIP
1549 #Constituency
1550 #Morpho
1551 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001552
Akronc13a1702016-03-15 19:33:14 +01001553
1554More importers are in preparation.
1555New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1556See the built-in annotation importers as examples.
1557
Akronf73ffb62018-06-27 12:13:59 +02001558
Akron941c1a62016-02-23 17:41:41 +01001559=head1 AVAILABILITY
1560
1561 https://github.com/KorAP/KorAP-XML-Krill
1562
1563
1564=head1 COPYRIGHT AND LICENSE
1565
Akroned9baf02019-01-22 17:03:25 +01001566Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001567
Akron941c1a62016-02-23 17:41:41 +01001568Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001569
Akrona76d8352016-10-27 16:27:32 +02001570Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001571
1572L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1573Corpus Analysis Platform at the
1574L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
1575member of the
1576L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1577
1578This program is free software published under the
1579L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1580
1581=cut