blob: e6754d9d11ef3af15d3b86df0b9094a3b26ea8ba [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
146# 2019/12/16
147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akron941c1a62016-02-23 17:41:41 +0100149# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100150
# Release information used in the help and version messages
our $LAST_CHANGE   = '2019/12/16';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Forward declarations of the helpers defined further down
sub get_file_name_from_glob($);
sub get_file_name($);

# The command is the first argument, unless this argument is an option
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Remember the remaining arguments, as serial processing
# passes them on to further runs of this script
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100169
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options holding a single value
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Callback for --help: Hand the documentation sections to pod2usage
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Callback for --version: Hand the version message to pod2usage
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);
218
Akron63f20d42017-04-10 23:40:29 +0200219
# Load from configuration.
# A value from the configuration file is only applied,
# if the option was not given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Options holding a single value:
  # configuration key => variable to fill
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],                 # Koral version
    [ 'input-base'            => \$input_base ],            # Input root base directory
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],            # Token base
    [ 'non-word-tokens'       => \$non_word_tokens ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],                # Write to tar
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $value_ref) = @$option;

    # The command line takes precedence
    next if defined $$value_ref;
    next unless defined $config{$key};

    $$value_ref = $config{$key};
  };

  # Options holding multiple values:
  # The values are separated by semicolons in the configuration
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $list_ref) = @$option;

    # The command line takes precedence
    next if scalar(@$list_ref);
    next unless defined $config{$key};

    @$list_ref = split /\s*;\s*/, $config{$key};
  };
};
341
Akron63f20d42017-04-10 23:40:29 +0200342
# Fall back to defaults for all options that were neither
# set on the command line nor in the configuration file
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$koral                 = $KORAL_VERSION         unless defined $koral;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;
$non_word_tokens       = 0                      unless defined $non_word_tokens;

# The base annotations default to the empty string
# and are compared in lower case
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');


# Initialize log4perl object writing to STDERR
my %log_config = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

my $log = Log::Log4perl->get_logger('main');


print "Reading config from $cfg_file\n" if $cfg_file;


# Parameters for pod2usage in case of wrong usage
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};


# A job count of -1 means: derive the number of jobs
# from the number of available cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
397
Akron821db3d2017-04-06 21:19:31 +0200398
# Start serial processing:
# Each input is converted by a separate run of the 'archive'
# command of this script, writing to its own target below the output.
if ($cmd && $cmd eq 'serial') {

  # The targets are built relative to the output, so an output is
  # required (an undefined output would make catdir() build the
  # targets from an empty base)
  pod2usage(%ERROR_HASH) unless $output;

  if ((!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set separately for each run
  my @pass_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Flag and value in a single argument, e.g. --input=archive.zip
    elsif ($arg =~ /\A(?:-i|--input|-o|--output)=/) {
      next;
    }

    # Pass parameter
    else {
      push @pass_argv, $arg;
    };
  };
  @keep_argv = @pass_argv;


  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - report failing runs,
    # but continue with the remaining inputs
    my $status = system @archive_cmd;
    if ($status != 0) {
      $log->error(
        "Serial processing of $serial_input failed" .
          ($status == -1 ? ": $!" : ' with exit status ' . ($status >> 8))
      );
    };
  };

  exit;
};
453
# Lookup table of everything to skip (in lower case)
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotations as [Foundry, Layer] pairs
# (optionally followed by a further parameter)
my @layers;

# Base - only if no other base annotation is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all base annotations taken from the DeReKo structure
my @dereko_attr = ();
foreach my $base (
  [ sentences  => $base_sentences ],
  [ paragraphs => $base_paragraphs ],
  [ pagebreaks => $base_pagebreaks ]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa, Glemm and HNC
push @layers, map { [$_, 'Morpho'] } qw/DRuKoLa Glemm HNC/;

# LWC and Malt
push @layers, map { [$_, 'Dependency'] } qw/LWC Malt/;

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] } qw/Morpho Constituency Sentences Dependency/;
548
Akron4fa37c32017-01-20 14:43:10 +0100549
# Check filters:
# If everything is skipped, only the explicitly requested annotations
# are used. Otherwise all annotations are used, that are not skipped
# by Foundry or by Foundry#Layer.
my @filtered_anno = $skip{'#all'}
  ? (map { [ split('#', $_) ] } @anno)
  : (grep {
      my $foundry = lc($_->[0]);
      my $layer   = lc($_->[1]);
      !$skip{$foundry} && !$skip{$foundry . '#' . $layer};
    } @layers);
568
# Get tokenization basis.
# The variables are declared separately from the conditional
# assignment, as the behaviour of "my ... if ..." is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - the layer is undefined,
# if the token base contains no layer part
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200574
# TODO: This should not be initialized for batch
# Cache object, configured by the cache options
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object with all conversion settings
my @batch_param = (
  cache           => $cache,
  meta_type       => $meta,
  overwrite       => $overwrite,
  foundry         => $token_base_foundry,
  layer           => $token_base_layer,
  gzip            => $gzip,
  log             => $log,
  koral           => $koral,
  primary         => $primary,
  pretty          => $pretty,
  anno            => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
597
# Get file name based on path information.
#
# Takes a path and returns a flat name for it: A leading temporary
# directory (/tmp/...) and the first input path are removed and the
# slashes are turned into dashes. If at least four dash separated
# parts remain, the first part is dropped.
sub get_file_name ($) {
  my $file = shift;

  # The first input path - for an existing directory
  # without trailing slash, the last path segment is removed
  my $i = $input[0] // '';
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path - it is quoted, as a path may contain
  # characters with a special meaning in regular expressions
  $file =~ s/\/?\Q$i\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
614
Akron63f20d42017-04-10 23:40:29 +0200615
# Get a file name based on an input path or glob pattern.
#
# Path separators and brackets are turned into dashes, wildcards and
# the '.zip' extension are removed. The extension is removed before
# the name is trimmed, so a pattern like 'dir/*.zip' results in
# 'dir' instead of 'dir-'.
sub get_file_name_from_glob ($) {
  my $glob = shift;
  $glob =~ s![\\\/]!-!g;       # Transform paths
  $glob =~ s/[\*\?]//g;        # Remove arbitrary fills
  $glob =~ s/[\{\}\[\]]/-/g;   # Remove class and multiple brackets
  $glob =~ s/\-\-+/-/g;        # Remove sequences of binding characters
  $glob =~ s/\.zip$//;         # Remove file extension
  $glob =~ s/^-//;             # Clean beginning
  $glob =~ s/-$//;             # Clean end
  return $glob;
};
627
628
# Convert sigle to path construct
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# With a command given, the output has to be an existing directory,
# unless the to-tar option is set
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
638
Akron63f20d42017-04-10 23:40:29 +0200639
# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($pattern));
  };

  # Wildcards without any match expand to nothing -
  # stop here instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
659
660
# Process a single file (no command given)
unless ($cmd) {

  # Take the start times as soon as this block is compiled
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  sub stop_time {
    my $now         = Benchmark->new;
    my $since_last  = timestr(timediff($now, $main::LAST_STOP));
    my $since_start = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last . ' (overall: ' . $since_start . ')'
    );
    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # the input path has to end with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
692
Nils Diewald59094f22014-11-05 18:20:50 +0000693
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object for the first input,
  # in case it is an existing file
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $further_input (@input[1..$#input]) {
    $archive->attach($further_input);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
738
Akron81500102017-04-07 20:45:44 +0200739
Akron941c1a62016-02-23 17:41:41 +0100740# Process an archive
741elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000742
Akron81500102017-04-07 20:45:44 +0200743 my $archive_output;
744
745 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100746 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200747
748 # Create new archive object
749 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
750
751 # Check zip capabilities
752 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200753 $log->error("Unzip is not installed or incompatible.");
754 exit 1;
Akron81500102017-04-07 20:45:44 +0200755 };
756
757 # Add further annotation archived
758 $archive->attach($_) foreach @input[1..$#input];
759
760 # Create a temporary directory
761 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200762 $extract_dir = tempdir(CLEANUP => 0);
763 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200764 };
765
Akron63f20d42017-04-10 23:40:29 +0200766 # Add some random extra to avoid clashes with multiple archives
767 $extract_dir = catdir($extract_dir, random_string('cccccc'));
768
Akron31a08cb2019-02-20 20:43:26 +0100769 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200770 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200771 @input = ($extract_dir);
772 }
773 else {
774 $log->error('Unable to extract from primary archive ' . $input[0] .
775 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200776 exit 1;
Akron81500102017-04-07 20:45:44 +0200777 };
778 }
779
780 # Can't create archive object
781 else {
782 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200783 exit 1;
Akron81500102017-04-07 20:45:44 +0200784 };
785 };
786
Akron7d4cdd82016-08-17 21:39:45 +0200787 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100788 my $pool = Parallel::ForkManager->new($jobs);
789
Akron7d4cdd82016-08-17 21:39:45 +0200790 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100791 my $iter = 1; # Current text in process
792
Akronda3097e2017-04-23 19:53:57 +0200793 my $tar_archive;
794 my $output_dir = $output;
795 my $tar_fh;
796
797 # Initialize tar archive
798 if ($to_tar) {
799 $tar_archive = Archive::Tar::Builder->new(
800 ignore_errors => 1
801 );
802
803 # Set output name
804 my $tar_file = $output;
805 unless ($tar_file =~ /\.tar$/) {
806 $tar_file .= '.tar';
807 };
808
809 # Initiate the tar file
810 print "Writing to file $tar_file\n";
811 $tar_fh = IO::File->new($tar_file, 'w');
812 $tar_fh->binmode(1);
813
814 # Set handle
815 $tar_archive->set_handle($tar_fh);
816
817 # Output to temporary directory
818 $output_dir = File::Temp->newdir;
819 };
820
Akron941c1a62016-02-23 17:41:41 +0100821 # Report on fork message
822 $pool->run_on_finish (
823 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200824 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100825 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200826
Akron08385f62016-03-22 20:37:04 +0100827 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200828 ($iter++) . "/$count]" .
829 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200830 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200831
832 if (!$code && $to_tar && $data->[2]) {
833 my $filename = $data->[2];
834
835 # Lock filehandle
836 if (flock($tar_fh, LOCK_EX)) {
837
Akron9a062ce2017-07-04 19:12:05 +0200838 my $clean_file = fileparse($filename);
839
Akronda3097e2017-04-23 19:53:57 +0200840 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200841 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200842 unlink $filename;
843
844 # Unlock filehandle
845 flock($tar_fh, LOCK_UN);
846 }
847 else {
848 $log->warn("Unable to add $filename to archive");
849 };
850 };
851
Akron4c0cf312016-10-15 16:42:09 +0200852 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100853 }
854 );
855
856 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200857 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100858 print "Reading data ...\n";
859
Akron7d4cdd82016-08-17 21:39:45 +0200860 # unless (Cache::FastMmap->new(
861 # share_file => $cache_file,
862 # cache_size => $cache_size,
863 # init_file => $cache_init
864 # )) {
865 # print "Unable to intialize cache '$cache_file'\n\n";
866 # exit(1);
867 # };
Akron11c80302016-03-18 19:44:43 +0100868
Akron486f9ab2017-04-22 23:25:19 +0200869
Akron941c1a62016-02-23 17:41:41 +0100870 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100871 if (-d $input[0]) {
872 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100873 my @dirs;
874 my $dir;
875
Akron7d4cdd82016-08-17 21:39:45 +0200876 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100877 while (1) {
878 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200879 push @dirs, $dir;
880 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100881 };
882 last unless $it->next;
883 };
884
885 print "Start processing ...\n";
886 $t = Benchmark->new;
887 $count = scalar @dirs;
888
889 DIRECTORY_LOOP:
890 for (my $i = 0; $i < $count; $i++) {
891
Akrone1dbc382016-07-08 22:24:52 +0200892 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200893 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200894 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200895 );
Akron941c1a62016-02-23 17:41:41 +0100896
897 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200898 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200899
Akron13d56622016-10-31 14:54:49 +0100900 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200901 $pool->finish(
902 0,
Akronda3097e2017-04-23 19:53:57 +0200903 [
904 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
905 undef,
906 $filename
907 ]
Akron486f9ab2017-04-22 23:25:19 +0200908 );
Akron3ec48972016-08-17 23:24:52 +0200909 }
910 else {
Akron4c0cf312016-10-15 16:42:09 +0200911 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200912 };
Akron941c1a62016-02-23 17:41:41 +0100913 };
914 }
915
916 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200917 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200918
Akron941c1a62016-02-23 17:41:41 +0100919 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200920 $log->error("Unzip is not installed or incompatible.");
921 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100922 };
923
Akron08385f62016-03-22 20:37:04 +0100924 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200925 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100926
Akron31a08cb2019-02-20 20:43:26 +0100927 # Get sigles to extract
928 my $prefix = set_sigle($archive);
929
Akron941c1a62016-02-23 17:41:41 +0100930 print "Start processing ...\n";
931 $t = Benchmark->new;
932 my @dirs = $archive->list_texts;
933 $count = scalar @dirs;
934
935 ARCHIVE_LOOP:
936 for (my $i = 0; $i < $count; $i++) {
937
938 # Split path information
939 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
940
Akrone1dbc382016-07-08 22:24:52 +0200941 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200942 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200943 get_file_name(
944 catfile($corpus, $doc, $text)
945 . '.json' . ($gzip ? '.gz' : '')
946 )
Akrone1dbc382016-07-08 22:24:52 +0200947 );
Akron941c1a62016-02-23 17:41:41 +0100948
949 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200950 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100951
Akron4c0cf312016-10-15 16:42:09 +0200952 # Create temporary file
953 $temp = File::Temp->newdir;
954
Akronbdf434a2016-10-24 17:42:07 +0200955 # TODO: Check if $filename exist at the beginning,
956 # because extraction can be horrible slow!
957
Akron941c1a62016-02-23 17:41:41 +0100958 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100959 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100960
Akron7d4cdd82016-08-17 21:39:45 +0200961 # Create corpus directory
962 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100963
Akron7d4cdd82016-08-17 21:39:45 +0200964 # Temporary directory
965 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100966
Akron7d4cdd82016-08-17 21:39:45 +0200967 # Write file
Akron13d56622016-10-31 14:54:49 +0100968 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200969
Akron4c0cf312016-10-15 16:42:09 +0200970 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100971 $pool->finish(
972 0,
Akronda3097e2017-04-23 19:53:57 +0200973 [
974 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
975 $temp,
976 $filename
977 ]
Akron13d56622016-10-31 14:54:49 +0100978 );
979 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200980 }
981 else {
Akron4c0cf312016-10-15 16:42:09 +0200982 # Delete temporary file
983 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200984 };
Akron941c1a62016-02-23 17:41:41 +0100985 }
Akron7d4cdd82016-08-17 21:39:45 +0200986
987 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100988 else {
Akron4c0cf312016-10-15 16:42:09 +0200989 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100990 };
991 };
992 }
993
994 else {
995 print "Input is neither a directory nor an archive.\n\n";
996 };
997
998 $pool->wait_all_children;
999
Akron11c80302016-03-18 19:44:43 +01001000 # Delete cache file
1001 unlink($cache_file) if $cache_delete;
1002
Akronda3097e2017-04-23 19:53:57 +02001003 # Close tar filehandle
1004 if ($to_tar && $tar_fh) {
1005 $tar_archive->finish;
1006 $tar_fh->close;
1007 print "Wrote to tar archive.\n";
1008 };
1009
Akron63f20d42017-04-10 23:40:29 +02001010 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001011 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001012};
Akron941c1a62016-02-23 17:41:41 +01001013
Nils Diewald2db9ad02013-10-29 19:26:43 +00001014
# Prepare the list of sigles to process for an archive.
#
# Reads and updates the file-level @sigle list:
#   - If @sigle is empty, it is filled with one text sigle
#     ("corpus/doc/text") per text listed in the archive.
#   - Otherwise every document sigle ("corpus/doc") is extracted
#     directly to $output (progress is printed to STDOUT), and
#     only the remaining sigles are kept in @sigle.
#
# Parameters:
#   $archive - archive object providing list_texts, split_path,
#              check_prefix and extract_sigle
#
# Returns the prefix value: the first element returned by split_path
# for the last listed text (no sigles given), or the result of
# check_prefix (sigles given). Stays 1 if neither was consulted.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # The archive is asked for a prefix only once,
  # when the first sigle is visited
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Sigles that are not document sigles are kept for later processing
  my @text_sigle;

  # Iterate over all sigles
  foreach my $current (@sigle) {

    # Sigle is a doc sigle ("corpus/doc", optionally with leading "./")
    if ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$current ...";
      $check_prefix_once->();
      print "\n";

      # Extract the whole document right away
      my $extracted = $archive->extract_sigle(
        [$current], $output, $sequential_extraction ? 1 : $jobs
      );

      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $current;
      $check_prefix_once->();
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
1088
1089
1090
# Remove the temporary extraction directory, if one is set,
# and report the count returned by remove_tree
if ($extract_dir) {
  my $objects = remove_tree(
    $extract_dir,
    { safe => 1 }
  );
  print "Removed directory $extract_dir with $objects objects.\n";
};

# Final empty line to terminate the output
print "\n";
1099
Nils Diewald2db9ad02013-10-29 19:26:43 +00001100__END__
Akron941c1a62016-02-23 17:41:41 +01001101
1102=pod
1103
1104=encoding utf8
1105
1106=head1 NAME
1107
Akronf7ad89e2016-03-16 18:22:47 +01001108korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001109
1110
1111=head1 SYNOPSIS
1112
Akrona76d8352016-10-27 16:27:32 +02001113 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001114
Akron2fd402b2016-10-27 21:26:48 +02001115
Akron941c1a62016-02-23 17:41:41 +01001116=head1 DESCRIPTION
1117
1118L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1119compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001120The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001121
1122
1123=head1 INSTALLATION
1124
1125The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1126
Akronaf386982016-10-12 00:33:25 +02001127 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001128
Akronc13a1702016-03-15 19:33:14 +01001129In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001130be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001131Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001133
1134=head1 ARGUMENTS
1135
Akrona76d8352016-10-27 16:27:32 +02001136 $ korapxml2krill -z --input <directory> --output <filename>
1137
1138Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001139It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001140
Akron941c1a62016-02-23 17:41:41 +01001141=over 2
1142
1143=item B<archive>
1144
Akron081639e2017-04-21 19:01:39 +02001145 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001146
Akron2fd402b2016-10-27 21:26:48 +02001147Converts an archive of KorAP-XML documents. It expects a directory
1148(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001149
1150=item B<extract>
1151
Akrona76d8352016-10-27 16:27:32 +02001152 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1153
1154Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001155
Akron63f20d42017-04-10 23:40:29 +02001156=item B<serial>
1157
1158 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1159
1160Convert archives sequentially. The inputs are not merged but treated
1161as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001163are created based on the archive name. In case the C<--to-tar> flag is given,
1164the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001165
1166
Akron941c1a62016-02-23 17:41:41 +01001167=back
1168
1169
1170=head1 OPTIONS
1171
1172=over 2
1173
Akrona76d8352016-10-27 16:27:32 +02001174=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001175
Akrona76d8352016-10-27 16:27:32 +02001176Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001177
Akron7606afa2016-10-25 16:23:49 +02001178Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001179document, while C<archive> expects a KorAP-XML corpus folder or a zip
1180file to batch process multiple files.
1181C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001182
Akrona76d8352016-10-27 16:27:32 +02001183C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001184that the first archive listed contains all primary data files
1185and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001186
Akron7606afa2016-10-25 16:23:49 +02001187 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001188
Akron821db3d2017-04-06 21:19:31 +02001189Input may also be defined using BSD glob wildcards.
1190
1191 -i 'file/news*.zip'
1192
1193The extended input array will be sorted in length order, so the shortest
1194path needs to contain all primary data files and all meta data files.
1195
Akron0c3e3752016-06-28 15:55:53 +02001196(The directory structure follows the base directory format,
1197that may include a C<.> root folder.
1198In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001199need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001201
Akron7606afa2016-10-25 16:23:49 +02001202To support zip files, a version of C<unzip> needs to be installed that is
1203compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001204
Akron7606afa2016-10-25 16:23:49 +02001205B<The root folder switch using the hash sign is experimental and
1206may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001207
Akronf73ffb62018-06-27 12:13:59 +02001208
Akron63f20d42017-04-10 23:40:29 +02001209=item B<--input-base|-ib> <directory>
1210
1211The base directory for inputs.
1212
1213
Akron941c1a62016-02-23 17:41:41 +01001214=item B<--output|-o> <directory|file>
1215
1216Output folder for archive processing or
1217document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001218writes to C<STDOUT> by default
1219(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001220
1221=item B<--overwrite|-w>
1222
1223Overwrite files that already exist.
1224
Akronf73ffb62018-06-27 12:13:59 +02001225
Akron3741f8b2016-12-21 19:55:21 +01001226=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001227
1228Define the default tokenization by specifying
1229the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001230of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001231
Akron3741f8b2016-12-21 19:55:21 +01001232
1233=item B<--base-sentences|-bs> <foundry>#<layer>
1234
1235Define the layer for base sentences.
1236If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001237Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1238layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001239
1240 Defaults to unset.
1241
1242
1243=item B<--base-paragraphs|-bp> <foundry>#<layer>
1244
1245Define the layer for base paragraphs.
1246If given, this will be used instead of using C<Base#Paragraphs>.
1247Currently C<DeReKo#Structure> is the only additional layer supported.
1248
1249 Defaults to unset.
1250
1251
Akron41ac10b2017-02-08 22:47:25 +01001252=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1253
1254Define the layer for base pagebreaks.
1255Currently C<DeReKo#Structure> is the only layer supported.
1256
1257 Defaults to unset.
1258
1259
Akron941c1a62016-02-23 17:41:41 +01001260=item B<--skip|-s> <foundry>[#<layer>]
1261
Akronf7ad89e2016-03-16 18:22:47 +01001262Skip specific annotations by specifying the foundry
1263(and optionally the layer with a C<#>-prefix),
1264e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001265Can be set multiple times.
1266
Akronf73ffb62018-06-27 12:13:59 +02001267
Akronc13a1702016-03-15 19:33:14 +01001268=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001269
Akronf7ad89e2016-03-16 18:22:47 +01001270Convert specific annotations by specifying the foundry
1271(and optionally the layer with a C<#>-prefix),
1272e.g. C<Mate> or C<Mate#Morpho>.
1273Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001274
Akronf73ffb62018-06-27 12:13:59 +02001275
Akron941c1a62016-02-23 17:41:41 +01001276=item B<--primary|-p>
1277
Akronc13a1702016-03-15 19:33:14 +01001278Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001279Can be flagged using C<--no-primary> as well.
1280This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001281
Akronf73ffb62018-06-27 12:13:59 +02001282
Akroned9baf02019-01-22 17:03:25 +01001283=item B<--non-word-tokens|-nwt>
1284
1285Tokenize non-word tokens like word tokens (defined as matching
1286C</[\d\w]/>). Useful to treat punctuations as tokens.
1287
1288 Defaults to unset.
1289
Akron941c1a62016-02-23 17:41:41 +01001290=item B<--jobs|-j>
1291
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001293for archive processing.
Akron11c80302016-03-18 19:44:43 +01001294Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001295
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1298
Akronc11f7982017-02-21 21:20:14 +01001299Pass -1, and the value will be set automatically to 5
1300times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001301This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron263274c2019-02-07 09:48:30 +01001304=item B<--koral|-k>
1305
1306Version of the output format. Supported versions are:
1307C<0> for legacy serialization, C<0.03> for serialization
1308with metadata fields as key-values on the root object,
1309C<0.4> for serialization with metadata fields as a list
1310of C<"@type":"koral:field"> objects.
1311
1312Currently defaults to C<0.03>.
1313
1314
Akron9ec88872017-04-12 16:29:06 +02001315=item B<--sequential-extraction|-se>
1316
Flag to force sequential extraction, so that the C<jobs> value does not apply to extraction.
1318Some systems may have problems with extracting multiple archives
1319to the same folder at the same time.
1320Can be flagged using C<--no-sequential-extraction> as well.
1321Defaults to C<false>.
1322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akron35db6e32016-03-17 22:42:22 +01001324=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001325
Akron35db6e32016-03-17 22:42:22 +01001326Define the metadata parser to use. Defaults to C<I5>.
1327Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1328This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001329
Akronf73ffb62018-06-27 12:13:59 +02001330
Akron941c1a62016-02-23 17:41:41 +01001331=item B<--pretty|-y>
1332
Akronc13a1702016-03-15 19:33:14 +01001333Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001334This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001335
Akronf73ffb62018-06-27 12:13:59 +02001336
Akron941c1a62016-02-23 17:41:41 +01001337=item B<--gzip|-z>
1338
Akronf7ad89e2016-03-16 18:22:47 +01001339Compress the output.
1340Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001341
Akronf73ffb62018-06-27 12:13:59 +02001342
Akron11c80302016-03-18 19:44:43 +01001343=item B<--cache|-c>
1344
1345File to mmap a cache (using L<Cache::FastMmap>).
1346Defaults to C<korapxml2krill.cache> in the calling directory.
1347
Akronf73ffb62018-06-27 12:13:59 +02001348
Akron11c80302016-03-18 19:44:43 +01001349=item B<--cache-size|-cs>
1350
1351Size of the cache. Defaults to C<50m>.
1352
Akronf73ffb62018-06-27 12:13:59 +02001353
Akron11c80302016-03-18 19:44:43 +01001354=item B<--cache-init|-ci>
1355
1356Initialize cache file.
1357Can be flagged using C<--no-cache-init> as well.
1358Defaults to C<true>.
1359
Akronf73ffb62018-06-27 12:13:59 +02001360
Akron11c80302016-03-18 19:44:43 +01001361=item B<--cache-delete|-cd>
1362
1363Delete cache file after processing.
1364Can be flagged using C<--no-cache-delete> as well.
1365Defaults to C<true>.
1366
Akronf73ffb62018-06-27 12:13:59 +02001367
Akron636aa112017-04-07 18:48:56 +02001368=item B<--config|-cfg>
1369
1370Configure the parameters of your call in a file
1371of key-value pairs with whitespace separator
1372
1373 overwrite 1
1374 token DeReKo#Structure
1375 ...
1376
1377Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001378C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001379C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001380C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001382C<base-sentences>, C<base-paragraphs>,
1383C<base-pagebreaks>,
1384C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001385(semicolon separated), C<anno> (semicolon separated).
1386
Akronf73ffb62018-06-27 12:13:59 +02001387Configuration parameters will always be overwritten by
1388passed parameters.
1389
1390
Akron81500102017-04-07 20:45:44 +02001391=item B<--temporary-extract|-te>
1392
1393Only valid for the C<archive> command.
1394
1395This will first extract all files into a
1396directory and then will archive.
1397If the directory is given as C<:temp:>,
1398a temporary directory is used.
1399This is especially useful to avoid
1400massive unzipping and potential
1401network latency.
Akron636aa112017-04-07 18:48:56 +02001402
Akronf73ffb62018-06-27 12:13:59 +02001403
Akronc93a0802019-07-11 15:48:34 +02001404=item B<--to-tar>
1405
1406Only valid for the C<archive> command.
1407
1408Writes the output into a tar archive.
1409
1410
Akrone10ad322016-02-27 10:54:26 +01001411=item B<--sigle|-sg>
1412
Akron20807582016-10-26 17:11:34 +02001413Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001414Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001415I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001416Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001417In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001418On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001419
Akronf73ffb62018-06-27 12:13:59 +02001420
Akron941c1a62016-02-23 17:41:41 +01001421=item B<--log|-l>
1422
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1424
Akronf73ffb62018-06-27 12:13:59 +02001425
Akron941c1a62016-02-23 17:41:41 +01001426=item B<--help|-h>
1427
1428Print this document.
1429
Akronf73ffb62018-06-27 12:13:59 +02001430
Akron941c1a62016-02-23 17:41:41 +01001431=item B<--version|-v>
1432
1433Print version information.
1434
1435=back
1436
Akronf73ffb62018-06-27 12:13:59 +02001437
Akronc13a1702016-03-15 19:33:14 +01001438=head1 ANNOTATION SUPPORT
1439
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1444
Akron821db3d2017-04-06 21:19:31 +02001445 Base
1446 #Paragraphs
1447 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001448
Akron821db3d2017-04-06 21:19:31 +02001449 Connexor
1450 #Morpho
1451 #Phrase
1452 #Sentences
1453 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001454
Akron821db3d2017-04-06 21:19:31 +02001455 CoreNLP
1456 #Constituency
1457 #Morpho
1458 #NamedEntities
1459 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001460
Akronce125b62017-06-19 11:54:36 +02001461 CMC
1462 #Morpho
1463
Akron821db3d2017-04-06 21:19:31 +02001464 DeReKo
1465 #Structure
Akronc13a1702016-03-15 19:33:14 +01001466
Akron57510c12019-01-04 14:58:53 +01001467 DGD
1468 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001469 #Structure
Akron57510c12019-01-04 14:58:53 +01001470
Akron821db3d2017-04-06 21:19:31 +02001471 DRuKoLa
1472 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001473
Akron821db3d2017-04-06 21:19:31 +02001474 Glemm
1475 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001476
Akronea1aed52018-07-19 14:43:34 +02001477 HNC
1478 #Morpho
1479
Akron4c679192018-01-16 17:41:49 +01001480 LWC
1481 #Dependency
1482
Akron821db3d2017-04-06 21:19:31 +02001483 Malt
1484 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001485
Akron821db3d2017-04-06 21:19:31 +02001486 MarMoT
1487 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001488
Akron821db3d2017-04-06 21:19:31 +02001489 Mate
1490 #Dependency
1491 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001492
Akron821db3d2017-04-06 21:19:31 +02001493 MDParser
1494 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001495
Akron821db3d2017-04-06 21:19:31 +02001496 OpenNLP
1497 #Morpho
1498 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001499
Akron821db3d2017-04-06 21:19:31 +02001500 Sgbr
1501 #Lemma
1502 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001503
Akron7d5e6382019-08-08 16:36:27 +02001504 Talismane
1505 #Dependency
1506 #Morpho
1507
Akron821db3d2017-04-06 21:19:31 +02001508 TreeTagger
1509 #Morpho
1510 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001511
Akron821db3d2017-04-06 21:19:31 +02001512 XIP
1513 #Constituency
1514 #Morpho
1515 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001516
Akronc13a1702016-03-15 19:33:14 +01001517
1518More importers are in preparation.
1519New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1520See the built-in annotation importers as examples.
1521
Akronf73ffb62018-06-27 12:13:59 +02001522
Akron941c1a62016-02-23 17:41:41 +01001523=head1 AVAILABILITY
1524
1525 https://github.com/KorAP/KorAP-XML-Krill
1526
1527
1528=head1 COPYRIGHT AND LICENSE
1529
Akroned9baf02019-01-22 17:03:25 +01001530Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001531
Akron941c1a62016-02-23 17:41:41 +01001532Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001533
Akrona76d8352016-10-27 16:27:32 +02001534Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001535
1536L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1537Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001538L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001539member of the
1540L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1541
1542This program is free software published under the
1543L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1544
1545=cut