blob: e524197cf9611ae6b89ddc0bf56b754850539eae [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron941c1a62016-02-23 17:41:41 +0100142# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100143
# Release metadata
our $LAST_CHANGE   = '2019/03/08';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# Banner printed by --help, --version and on usage errors
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Forward declarations, so both helpers can be called
# (with their prototypes) before they are defined
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command:
# A leading argument that does not start with a dash
# is taken as the command and removed from @ARGV
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Remember the remaining arguments before GetOptions consumes them
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100162
# List-valued options
my (@skip, @sigle, @anno, @input);
my $text;

# Scalar-valued options (stay undefined unless given on the command line)
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar, $non_word_tokens,
  $sequential_extraction, $cache_size, $cache_delete, $cache_init
);

# Handler for --help: print the main POD sections
my $print_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose => 99,
    -msg => $VERSION_MSG,
    -output => '-'
  );
};

# Handler for --version: print the version banner only
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg => $VERSION_MSG,
    -output => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $print_help,
  'version|v'                => $print_version
);
211
Akron63f20d42017-04-10 23:40:29 +0200212
# Load from configuration
#
# Values from the configuration file are only used for options
# that were NOT given on the command line. Scalar options are copied
# as-is; list options (skip, sigle, anno) are given as a single
# semicolon separated string in the file.
if ($cfg_file) {

  # A configuration file was requested but is not there -
  # say so instead of silently running with defaults
  unless (-e $cfg_file) {
    warn "Configuration file '$cfg_file' does not exist - ignoring\n";
  }

  else {
    my %config;

    # import_from() returns a false value if the file can't be read or parsed
    Config::Simple->import_from($cfg_file, \%config)
      or warn "Unable to load configuration from '$cfg_file': " .
      (Config::Simple->error // 'unknown error') . "\n";

    # Configuration key => option variable
    my %scalar_target = (
      'overwrite'             => \$overwrite,
      'gzip'                  => \$gzip,
      'jobs'                  => \$jobs,
      'koral'                 => \$koral,
      'input-base'            => \$input_base,
      'temporary-extract'     => \$extract_dir,
      'token'                 => \$token_base,
      'non-word-tokens'       => \$non_word_tokens,
      'cache'                 => \$cache_file,
      'cache-size'            => \$cache_size,
      'cache-delete'          => \$cache_delete,
      'cache-init'            => \$cache_init,
      'sequential-extraction' => \$sequential_extraction,
      'meta'                  => \$meta,
      'output'                => \$output,
      'base-sentences'        => \$base_sentences,
      'base-paragraphs'       => \$base_paragraphs,
      'base-pagebreaks'       => \$base_pagebreaks,
      'to-tar'                => \$to_tar,
      'log'                   => \$log_level
    );

    # Command line wins: only fill options that are still undefined
    foreach my $key (sort keys %scalar_target) {
      my $target = $scalar_target{$key};
      next if defined $$target;
      $$target = $config{$key} if defined $config{$key};
    };

    # Configuration key => list option variable
    my %list_target = (
      'skip'  => \@skip,
      'sigle' => \@sigle,
      'anno'  => \@anno
    );

    # Command line wins: only fill lists that are still empty
    foreach my $key (sort keys %list_target) {
      my $target = $list_target{$key};
      next if @$target;
      @$target = split /\s*;\s*/, $config{$key} if defined $config{$key};
    };
  };
};
334
Akron63f20d42017-04-10 23:40:29 +0200335
# Fall back to built-in defaults for everything that was set
# neither on the command line nor in the configuration file
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$koral                 = $KORAL_VERSION         unless defined $koral;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;
$non_word_tokens       = 0                      unless defined $non_word_tokens;

# Base annotations default to the empty string
# and are compared in lower case from here on
$base_sentences  = lc($base_sentences  // '');
$base_paragraphs = lc($base_paragraphs // '');
$base_pagebreaks = lc($base_pagebreaks // '');
Akron3741f8b2016-12-21 19:55:21 +0100354
Akron63f20d42017-04-10 23:40:29 +0200355
# Initialize log4perl object:
# everything at or above the requested level goes to STDERR
my %log4perl_conf = (
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log4perl_conf);

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file in use
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
368
369
# Arguments for pod2usage() on invalid invocation:
# print the usage sections and exit with status 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};

# A job count of -1 means: derive the number of jobs
# from the number of available CPU cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
390
Akron821db3d2017-04-06 21:19:31 +0200391
# Start serial processing
#
# Runs the 'archive' command once per input in a separate process.
# All options except input and output are forwarded unchanged; every
# input gets its own output path below the given output directory,
# named after the input (see get_file_name_from_glob).
# Exits with status 1 if any of the archive runs failed.
if ($cmd && $cmd eq 'serial') {

  # The per-input output paths are built below $output,
  # so an output base is required
  pod2usage(%ERROR_HASH) unless $output;

  if (!defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output options from the forwarded arguments,
  # both in the separated ('-i value') and the joined ('--input=value') form
  my $io_flag     = qr/^--?(?:input|i|output|o)/;
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Flag with the value in the next argument
    if ($_ =~ /${io_flag}\z/) {
      $remove_next = 1;
      $keep = 0;
    }

    # Flag with the value attached
    elsif ($_ =~ /${io_flag}=/) {
      $remove_next = 0;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    $keep;
  } @keep_argv;

  my $failed = 0;

  # Iterate over all inputs
  foreach my $in (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($in));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form - no shell involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $in, '-o', $new_out);
    print "Start serial processing of $in to $new_out\n";

    # Start archiving and remember failures,
    # but continue with the remaining inputs
    if (system(@archive_cmd) != 0) {
      my $reason = $? == -1 ? "unable to start: $!" : 'exit code ' . ($? >> 8);
      $log->error("Serial processing of $in failed ($reason)");
      $failed = 1;
    };
  };

  exit($failed ? 1 : 0);
};
446
# Lookup table of everything to skip (case insensitive)
my %skip = map { lc($_) => 1 } @skip;

# DeReKo structure may serve as the base for sentences,
# paragraphs and pagebreaks - collect what was requested
my @dereko_attr = ();
foreach my $base (
  [sentences  => $base_sentences],
  [paragraphs => $base_paragraphs],
  [pagebreaks => $base_pagebreaks]
) {
  push @dereko_attr, $base->[0] if $base->[1] eq 'dereko#structure';
};

# All supported annotations as [Foundry, Layer(, Parameter)]
my @layers = (

  # Base layers are only added if no explicit base is given
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo
  (
    $dereko_attr[0]
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # DGD
  ['DGD', 'Morpho'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);

# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly requested annotations
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped,
# either by Foundry or by Foundry#Layer
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
554
# Get tokenization basis
#
# The token base is given as 'Foundry#Layer'. The variables are declared
# unconditionally, as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension (there may be no layer part at all)
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200560
# TODO: This should not be initialized for batch
# Shared cache, backed by the cache file
my @cache_param = (
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);
my $cache = Cache::FastMmap->new(@cache_param);

# Create batch object - it receives the cache, the tokenization base,
# the filtered annotations and all output related switches
my @batch_param = (
  cache           => $cache,
  meta_type       => $meta,
  overwrite       => $overwrite,
  foundry         => $token_base_foundry,
  layer           => $token_base_layer,
  gzip            => $gzip,
  log             => $log,
  koral           => $koral,
  primary         => $primary,
  pretty          => $pretty,
  anno            => \@filtered_anno,
  non_word_tokens => $non_word_tokens
);
my $batch_file = KorAP::XML::Batch::File->new(@batch_param);
583
# Get file name based on path information
#
# Takes the path of a text and returns a flat name for it:
# temp dir fragments and the path of the first input are removed,
# slashes are turned into dashes, and leading dashes are dropped.
# NOTE(review): the final substitution seems meant to keep only the
# trailing corpus-doc-text part of the name - confirm against callers.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input; for directories the
  # last path segment (without trailing slash) is dropped
  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path - it is a literal path, not a pattern,
  # so regex metacharacters in it have to be quoted
  $file =~ s/\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
600
Akron63f20d42017-04-10 23:40:29 +0200601
# Turn an input path or glob pattern into a flat name
# (used to name the output per input in serial processing)
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  # Path separators and class/alternation brackets become dashes,
  # wildcards are dropped
  $name =~ tr{\\/}{-};
  $name =~ tr/*?//d;
  $name =~ tr/{}[]/-/;

  # Squeeze sequences of dashes into a single one
  $name =~ tr/-/-/s;

  # Strip one dash at either end, then the archive extension
  $name =~ s/^-//;
  $name =~ s/-$//;
  $name =~ s/\.zip$//;

  return $name;
};
613
614
# Convert sigle to path construct,
# i.e. 'Corpus_Doc.Text' becomes 'Corpus/Doc/Text'
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# All commands that do not write to a tar file
# require an existing output directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
624
Akron63f20d42017-04-10 23:40:29 +0200625
# Glob and prefix files
#
# Every input may be a wildcard pattern, optionally relative to the
# input base. @input is replaced by all matching paths, shortest first.
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push (@new_input, bsd_glob($pattern));
  };

  # A wildcard without any match expands to nothing -
  # stop here instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
645
646
# Process a single file
# (no command given: the first input is converted directly)
unless ($cmd) {
  my $input = $input[0];

  # Take the start time as early as possible (at compile time)
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now   = Benchmark->new;
    my $step  = timestr(timediff($now, $main::LAST_STOP));
    my $total = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $step . ' (overall: ' . $total . ')');

    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # make sure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
678
Nils Diewald59094f22014-11-05 18:20:50 +0000679
Akrone10ad322016-02-27 10:54:26 +0100680# Extract XML files
Akron81500102017-04-07 20:45:44 +0200681if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100682
Akrond5643ad2017-07-04 20:27:13 +0200683 # Output is required
684 pod2usage(%ERROR_HASH) unless $output;
685
Akron7d4cdd82016-08-17 21:39:45 +0200686 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200687 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100688
Akron7d4cdd82016-08-17 21:39:45 +0200689 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100690 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200691 $log->error("Unzip is not installed or incompatible.");
692 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100693 };
694
Akronb0c88db2016-06-29 16:33:18 +0200695 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200696 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200697
Akron31a08cb2019-02-20 20:43:26 +0100698 # Will set @sigle
699 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200700
Akron31a08cb2019-02-20 20:43:26 +0100701# my $prefix = 1;
702#
703# # No sigles given
704# unless (@sigle) {
705#
706# # Get files
707# foreach ($archive->list_texts) {
708#
709# # Split path information
710# ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
711#
712# # TODO: Make this OS independent
713# push @sigle, join '/', $corpus, $doc, $text;
714# };
715# }
716#
717# # Check sigle for doc sigles
718# else {
719# my @new_sigle;
720#
721# my $prefix_check = 0;
722#
723# # Iterate over all sigle
724# foreach (@sigle) {
725#
726# # Sigle is a doc sigle
727# if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
728#
729# print "$_ ...";
730# # Check if a prefix is needed
731# unless ($prefix_check) {
732#
733# if ($prefix = $archive->check_prefix) {
734# print " with prefix ...";
735# };
736# $prefix_check = 1;
737# };
738#
739# print "\n";
740#
741# # TODO: Make this OS independent
742# my $path = ($prefix ? './' : '') . $_;
743#
744# print '... ' . (
745# $archive->extract_doc(
746# $path, $output, $sequential_extraction ? 1 : $jobs
747# ) ? '' : 'not '
748# );
749# print "extracted.\n";
750# }
751#
752# # Sigle is a text sigle
753# else {
754# push @new_sigle, $_;
755#
756# unless ($prefix_check) {
757#
758# if ($prefix = $archive->check_prefix) {
759# print " with prefix ...";
760# };
761# $prefix_check = 1;
762# };
763# };
764# };
765# @sigle = @new_sigle;
766# };
Akron03b24db2016-08-16 20:54:32 +0200767
Akrone10ad322016-02-27 10:54:26 +0100768 # Iterate over all given sigles and extract
769 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100770
Akron2812ba22016-10-28 21:55:59 +0200771 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200772
Akron03b24db2016-08-16 20:54:32 +0200773 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200774 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100775
Akron955b75b2019-02-21 14:28:41 +0100776 # TODO:
777 # - prefix???
778 $archive->extract_sigle([$_], $output, $jobs)
779 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200780 );
Akrone10ad322016-02-27 10:54:26 +0100781 print "extracted.\n";
782 };
Akronb0c88db2016-06-29 16:33:18 +0200783 }
Akron7d4cdd82016-08-17 21:39:45 +0200784
785 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200786 else {
787 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200788 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100789 };
790}
791
Akron81500102017-04-07 20:45:44 +0200792
Akron941c1a62016-02-23 17:41:41 +0100793# Process an archive
794elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000795
Akron81500102017-04-07 20:45:44 +0200796 my $archive_output;
797
798 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100799 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200800
801 # Create new archive object
802 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
803
804 # Check zip capabilities
805 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200806 $log->error("Unzip is not installed or incompatible.");
807 exit 1;
Akron81500102017-04-07 20:45:44 +0200808 };
809
810 # Add further annotation archived
811 $archive->attach($_) foreach @input[1..$#input];
812
813 # Create a temporary directory
814 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200815 $extract_dir = tempdir(CLEANUP => 0);
816 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200817 };
818
Akron63f20d42017-04-10 23:40:29 +0200819 # Add some random extra to avoid clashes with multiple archives
820 $extract_dir = catdir($extract_dir, random_string('cccccc'));
821
Akron31a08cb2019-02-20 20:43:26 +0100822 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200823 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200824 @input = ($extract_dir);
825 }
826 else {
827 $log->error('Unable to extract from primary archive ' . $input[0] .
828 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200829 exit 1;
Akron81500102017-04-07 20:45:44 +0200830 };
831 }
832
833 # Can't create archive object
834 else {
835 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200836 exit 1;
Akron81500102017-04-07 20:45:44 +0200837 };
838 };
839
Akron7d4cdd82016-08-17 21:39:45 +0200840 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100841 my $pool = Parallel::ForkManager->new($jobs);
842
Akron7d4cdd82016-08-17 21:39:45 +0200843 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100844 my $iter = 1; # Current text in process
845
Akronda3097e2017-04-23 19:53:57 +0200846 my $tar_archive;
847 my $output_dir = $output;
848 my $tar_fh;
849
850 # Initialize tar archive
851 if ($to_tar) {
852 $tar_archive = Archive::Tar::Builder->new(
853 ignore_errors => 1
854 );
855
856 # Set output name
857 my $tar_file = $output;
858 unless ($tar_file =~ /\.tar$/) {
859 $tar_file .= '.tar';
860 };
861
862 # Initiate the tar file
863 print "Writing to file $tar_file\n";
864 $tar_fh = IO::File->new($tar_file, 'w');
865 $tar_fh->binmode(1);
866
867 # Set handle
868 $tar_archive->set_handle($tar_fh);
869
870 # Output to temporary directory
871 $output_dir = File::Temp->newdir;
872 };
873
Akron941c1a62016-02-23 17:41:41 +0100874 # Report on fork message
875 $pool->run_on_finish (
876 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200877 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100878 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200879
Akron08385f62016-03-22 20:37:04 +0100880 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200881 ($iter++) . "/$count]" .
882 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200883 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200884
885 if (!$code && $to_tar && $data->[2]) {
886 my $filename = $data->[2];
887
888 # Lock filehandle
889 if (flock($tar_fh, LOCK_EX)) {
890
Akron9a062ce2017-07-04 19:12:05 +0200891 my $clean_file = fileparse($filename);
892
Akronda3097e2017-04-23 19:53:57 +0200893 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200894 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200895 unlink $filename;
896
897 # Unlock filehandle
898 flock($tar_fh, LOCK_UN);
899 }
900 else {
901 $log->warn("Unable to add $filename to archive");
902 };
903 };
904
Akron4c0cf312016-10-15 16:42:09 +0200905 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100906 }
907 );
908
909 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200910 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100911 print "Reading data ...\n";
912
Akron7d4cdd82016-08-17 21:39:45 +0200913 # unless (Cache::FastMmap->new(
914 # share_file => $cache_file,
915 # cache_size => $cache_size,
916 # init_file => $cache_init
917 # )) {
918 # print "Unable to intialize cache '$cache_file'\n\n";
919 # exit(1);
920 # };
Akron11c80302016-03-18 19:44:43 +0100921
Akron486f9ab2017-04-22 23:25:19 +0200922
Akron941c1a62016-02-23 17:41:41 +0100923 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100924 if (-d $input[0]) {
925 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100926 my @dirs;
927 my $dir;
928
Akron7d4cdd82016-08-17 21:39:45 +0200929 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100930 while (1) {
931 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200932 push @dirs, $dir;
933 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100934 };
935 last unless $it->next;
936 };
937
938 print "Start processing ...\n";
939 $t = Benchmark->new;
940 $count = scalar @dirs;
941
942 DIRECTORY_LOOP:
943 for (my $i = 0; $i < $count; $i++) {
944
Akrone1dbc382016-07-08 22:24:52 +0200945 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200946 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200947 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200948 );
Akron941c1a62016-02-23 17:41:41 +0100949
950 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200951 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200952
Akron13d56622016-10-31 14:54:49 +0100953 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200954 $pool->finish(
955 0,
Akronda3097e2017-04-23 19:53:57 +0200956 [
957 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
958 undef,
959 $filename
960 ]
Akron486f9ab2017-04-22 23:25:19 +0200961 );
Akron3ec48972016-08-17 23:24:52 +0200962 }
963 else {
Akron4c0cf312016-10-15 16:42:09 +0200964 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200965 };
Akron941c1a62016-02-23 17:41:41 +0100966 };
967 }
968
969 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200970 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200971
Akron941c1a62016-02-23 17:41:41 +0100972 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200973 $log->error("Unzip is not installed or incompatible.");
974 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100975 };
976
Akron08385f62016-03-22 20:37:04 +0100977 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200978 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100979
Akron31a08cb2019-02-20 20:43:26 +0100980 # Get sigles to extract
981 my $prefix = set_sigle($archive);
982
Akron941c1a62016-02-23 17:41:41 +0100983 print "Start processing ...\n";
984 $t = Benchmark->new;
985 my @dirs = $archive->list_texts;
986 $count = scalar @dirs;
987
988 ARCHIVE_LOOP:
989 for (my $i = 0; $i < $count; $i++) {
990
991 # Split path information
992 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
993
Akrone1dbc382016-07-08 22:24:52 +0200994 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200995 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200996 get_file_name(
997 catfile($corpus, $doc, $text)
998 . '.json' . ($gzip ? '.gz' : '')
999 )
Akrone1dbc382016-07-08 22:24:52 +02001000 );
Akron941c1a62016-02-23 17:41:41 +01001001
1002 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +02001003 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +01001004
Akron4c0cf312016-10-15 16:42:09 +02001005 # Create temporary file
1006 $temp = File::Temp->newdir;
1007
Akronbdf434a2016-10-24 17:42:07 +02001008 # TODO: Check if $filename exist at the beginning,
1009 # because extraction can be horrible slow!
1010
Akron941c1a62016-02-23 17:41:41 +01001011 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +01001012 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +01001013
Akron7d4cdd82016-08-17 21:39:45 +02001014 # Create corpus directory
1015 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +01001016
Akron7d4cdd82016-08-17 21:39:45 +02001017 # Temporary directory
1018 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +01001019
Akron7d4cdd82016-08-17 21:39:45 +02001020 # Write file
Akron13d56622016-10-31 14:54:49 +01001021 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +02001022
Akron4c0cf312016-10-15 16:42:09 +02001023 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +01001024 $pool->finish(
1025 0,
Akronda3097e2017-04-23 19:53:57 +02001026 [
1027 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
1028 $temp,
1029 $filename
1030 ]
Akron13d56622016-10-31 14:54:49 +01001031 );
1032 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001033 }
1034 else {
Akron4c0cf312016-10-15 16:42:09 +02001035 # Delete temporary file
1036 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +02001037 };
Akron941c1a62016-02-23 17:41:41 +01001038 }
Akron7d4cdd82016-08-17 21:39:45 +02001039
1040 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +01001041 else {
Akron4c0cf312016-10-15 16:42:09 +02001042 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +01001043 };
1044 };
1045 }
1046
1047 else {
1048 print "Input is neither a directory nor an archive.\n\n";
1049 };
1050
1051 $pool->wait_all_children;
1052
Akron11c80302016-03-18 19:44:43 +01001053 # Delete cache file
1054 unlink($cache_file) if $cache_delete;
1055
Akronda3097e2017-04-23 19:53:57 +02001056 # Close tar filehandle
1057 if ($to_tar && $tar_fh) {
1058 $tar_archive->finish;
1059 $tar_fh->close;
1060 print "Wrote to tar archive.\n";
1061 };
1062
Akron63f20d42017-04-10 23:40:29 +02001063 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001064 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001065};
Akron941c1a62016-02-23 17:41:41 +01001066
Nils Diewald2db9ad02013-10-29 19:26:43 +00001067
# Prepare the list of sigles to process for an archive.
#
# Works on the file-level @sigle list:
#  - If @sigle is empty, it is filled with one "corpus/doc/text" sigle
#    for every text listed by the archive.
#  - Otherwise every document-level sigle ("corpus/doc") is extracted
#    to $output right away, and only the remaining sigles are kept
#    in @sigle.
#
# Parameter: the archive object; list_texts, split_path, check_prefix
#            and extract_sigle are called on it.
# Returns:   the prefix value last reported by split_path (no sigles given)
#            or the result of check_prefix (sigles given). The initial
#            value 1 is returned if neither method was called.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - collect the sigles of all texts in the archive
  unless (@sigle) {

    # A named loop variable is used instead of $_, so the method
    # calls below can not clobber the iterated value
    for my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join('/', $corpus, $doc, $text);
    };

    return $prefix;
  };

  # The prefix is looked up only once, while handling the first sigle
  my $prefix_check = 0;
  my $check_prefix_once = sub {
    return if $prefix_check;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_check = 1;
    return;
  };

  # Sigles that are not document-level sigles are kept for the caller
  my @new_sigle;

  # Iterate over all sigle
  for my $item (@sigle) {

    # Sigle is a text sigle - remember it
    unless ($item =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @new_sigle, $item;
      $check_prefix_once->();
      next;
    };

    # Sigle is a doc sigle - extract the whole document immediately
    print "$item ...";
    $check_prefix_once->();
    print "\n";

    # With sequential extraction a single job is used
    my $extracted = $archive->extract_sigle(
      [$item], $output, $sequential_extraction ? 1 : $jobs
    );

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  @sigle = @new_sigle;

  return $prefix;
};
1141
1142
1143
# Clean up the temporary extraction directory at the end of the run.
# remove_tree reports the number of removed objects, which is echoed
# to the user.
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir .
    ' with ' . $removed . " objects.\n";
};


print "\n";
1152
Nils Diewald2db9ad02013-10-29 19:26:43 +00001153__END__
Akron941c1a62016-02-23 17:41:41 +01001154
1155=pod
1156
1157=encoding utf8
1158
1159=head1 NAME
1160
Akronf7ad89e2016-03-16 18:22:47 +01001161korapxml2krill - Merge KorapXML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001162
1163
1164=head1 SYNOPSIS
1165
Akrona76d8352016-10-27 16:27:32 +02001166 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001167
Akron2fd402b2016-10-27 21:26:48 +02001168
Akron941c1a62016-02-23 17:41:41 +01001169=head1 DESCRIPTION
1170
1171L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1172compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akronf7ad89e2016-03-16 18:22:47 +01001173The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron941c1a62016-02-23 17:41:41 +01001174
1175
1176=head1 INSTALLATION
1177
1178The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1179
Akronaf386982016-10-12 00:33:25 +02001180 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001181
Akronc13a1702016-03-15 19:33:14 +01001182In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001183be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001184Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001186
1187=head1 ARGUMENTS
1188
Akrona76d8352016-10-27 16:27:32 +02001189 $ korapxml2krill -z --input <directory> --output <filename>
1190
1191Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001192It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001193
Akron941c1a62016-02-23 17:41:41 +01001194=over 2
1195
1196=item B<archive>
1197
Akron081639e2017-04-21 19:01:39 +02001198 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001199
Akron2fd402b2016-10-27 21:26:48 +02001200Converts an archive of KorAP-XML documents. It expects a directory
1201(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001202
1203=item B<extract>
1204
Akrona76d8352016-10-27 16:27:32 +02001205 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1206
1207Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001208
Akron63f20d42017-04-10 23:40:29 +02001209=item B<serial>
1210
1211 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1212
Convert archives sequentially. The inputs are not merged but treated
as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
are created based on the archive name. In case the C<--to-tar> flag is given,
the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001218
1219
Akron941c1a62016-02-23 17:41:41 +01001220=back
1221
1222
1223=head1 OPTIONS
1224
1225=over 2
1226
Akrona76d8352016-10-27 16:27:32 +02001227=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001228
Akrona76d8352016-10-27 16:27:32 +02001229Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001230
Akron7606afa2016-10-25 16:23:49 +02001231Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001232document, while C<archive> expects a KorAP-XML corpus folder or a zip
1233file to batch process multiple files.
1234C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001235
Akrona76d8352016-10-27 16:27:32 +02001236C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001237that the first archive listed contains all primary data files
1238and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001239
Akron7606afa2016-10-25 16:23:49 +02001240 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001241
Akron821db3d2017-04-06 21:19:31 +02001242Input may also be defined using BSD glob wildcards.
1243
1244 -i 'file/news*.zip'
1245
1246The extended input array will be sorted in length order, so the shortest
1247path needs to contain all primary data files and all meta data files.
1248
Akron0c3e3752016-06-28 15:55:53 +02001249(The directory structure follows the base directory format,
1250that may include a C<.> root folder.
1251In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001252need to be passed with a hash sign in front of the archive's name.
1253This may require to quote the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001254
Akron7606afa2016-10-25 16:23:49 +02001255To support zip files, a version of C<unzip> needs to be installed that is
1256compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001257
Akron7606afa2016-10-25 16:23:49 +02001258B<The root folder switch using the hash sign is experimental and
1259may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001260
Akronf73ffb62018-06-27 12:13:59 +02001261
Akron63f20d42017-04-10 23:40:29 +02001262=item B<--input-base|-ib> <directory>
1263
1264The base directory for inputs.
1265
1266
Akron941c1a62016-02-23 17:41:41 +01001267=item B<--output|-o> <directory|file>
1268
1269Output folder for archive processing or
1270document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001271writes to C<STDOUT> by default
1272(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001273
1274=item B<--overwrite|-w>
1275
1276Overwrite files that already exist.
1277
Akronf73ffb62018-06-27 12:13:59 +02001278
Akron3741f8b2016-12-21 19:55:21 +01001279=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001280
1281Define the default tokenization by specifying
1282the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001283of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron941c1a62016-02-23 17:41:41 +01001284
Akron3741f8b2016-12-21 19:55:21 +01001285
1286=item B<--base-sentences|-bs> <foundry>#<layer>
1287
1288Define the layer for base sentences.
1289If given, this will be used instead of using C<Base#Sentences>.
1290Currently C<DeReKo#Structure> is the only additional layer supported.
1291
1292 Defaults to unset.
1293
1294
1295=item B<--base-paragraphs|-bp> <foundry>#<layer>
1296
1297Define the layer for base paragraphs.
1298If given, this will be used instead of using C<Base#Paragraphs>.
1299Currently C<DeReKo#Structure> is the only additional layer supported.
1300
1301 Defaults to unset.
1302
1303
Akron41ac10b2017-02-08 22:47:25 +01001304=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1305
1306Define the layer for base pagebreaks.
1307Currently C<DeReKo#Structure> is the only layer supported.
1308
1309 Defaults to unset.
1310
1311
Akron941c1a62016-02-23 17:41:41 +01001312=item B<--skip|-s> <foundry>[#<layer>]
1313
Akronf7ad89e2016-03-16 18:22:47 +01001314Skip specific annotations by specifying the foundry
1315(and optionally the layer with a C<#>-prefix),
1316e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001317Can be set multiple times.
1318
Akronf73ffb62018-06-27 12:13:59 +02001319
Akronc13a1702016-03-15 19:33:14 +01001320=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001321
Akronf7ad89e2016-03-16 18:22:47 +01001322Convert specific annotations by specifying the foundry
1323(and optionally the layer with a C<#>-prefix),
1324e.g. C<Mate> or C<Mate#Morpho>.
1325Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001326
Akronf73ffb62018-06-27 12:13:59 +02001327
Akron941c1a62016-02-23 17:41:41 +01001328=item B<--primary|-p>
1329
Akronc13a1702016-03-15 19:33:14 +01001330Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001331Can be flagged using C<--no-primary> as well.
1332This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001333
Akronf73ffb62018-06-27 12:13:59 +02001334
Akroned9baf02019-01-22 17:03:25 +01001335=item B<--non-word-tokens|-nwt>
1336
1337Tokenize non-word tokens like word tokens (defined as matching
C</[\d\w]/>). Useful to treat punctuation as tokens.
1339
1340 Defaults to unset.
1341
Akron941c1a62016-02-23 17:41:41 +01001342=item B<--jobs|-j>
1343
Define the number of concurrent jobs in separate forks
for archive processing.
Defaults to C<0> (everything runs in a single process).

Unless C<sequential-extraction> is set, this will
also apply to extraction.
1350
Akronc11f7982017-02-21 21:20:14 +01001351Pass -1, and the value will be set automatically to 5
1352times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001353This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001354
Akronf73ffb62018-06-27 12:13:59 +02001355
Akron263274c2019-02-07 09:48:30 +01001356=item B<--koral|-k>
1357
1358Version of the output format. Supported versions are:
1359C<0> for legacy serialization, C<0.03> for serialization
1360with metadata fields as key-values on the root object,
1361C<0.4> for serialization with metadata fields as a list
1362of C<"@type":"koral:field"> objects.
1363
1364Currently defaults to C<0.03>.
1365
1366
Akron9ec88872017-04-12 16:29:06 +02001367=item B<--sequential-extraction|-se>
1368
Flag to indicate that archives are extracted sequentially,
i.e. that the C<jobs> value is not applied to extraction.
Some systems may have problems with extracting multiple archives
to the same folder at the same time.
1372Can be flagged using C<--no-sequential-extraction> as well.
1373Defaults to C<false>.
1374
Akronf73ffb62018-06-27 12:13:59 +02001375
Akron35db6e32016-03-17 22:42:22 +01001376=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001377
Akron35db6e32016-03-17 22:42:22 +01001378Define the metadata parser to use. Defaults to C<I5>.
1379Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1380This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001381
Akronf73ffb62018-06-27 12:13:59 +02001382
Akron941c1a62016-02-23 17:41:41 +01001383=item B<--pretty|-y>
1384
Akronc13a1702016-03-15 19:33:14 +01001385Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001386This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001387
Akronf73ffb62018-06-27 12:13:59 +02001388
Akron941c1a62016-02-23 17:41:41 +01001389=item B<--gzip|-z>
1390
Akronf7ad89e2016-03-16 18:22:47 +01001391Compress the output.
1392Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001393
Akronf73ffb62018-06-27 12:13:59 +02001394
Akron11c80302016-03-18 19:44:43 +01001395=item B<--cache|-c>
1396
1397File to mmap a cache (using L<Cache::FastMmap>).
1398Defaults to C<korapxml2krill.cache> in the calling directory.
1399
Akronf73ffb62018-06-27 12:13:59 +02001400
Akron11c80302016-03-18 19:44:43 +01001401=item B<--cache-size|-cs>
1402
1403Size of the cache. Defaults to C<50m>.
1404
Akronf73ffb62018-06-27 12:13:59 +02001405
Akron11c80302016-03-18 19:44:43 +01001406=item B<--cache-init|-ci>
1407
1408Initialize cache file.
1409Can be flagged using C<--no-cache-init> as well.
1410Defaults to C<true>.
1411
Akronf73ffb62018-06-27 12:13:59 +02001412
Akron11c80302016-03-18 19:44:43 +01001413=item B<--cache-delete|-cd>
1414
1415Delete cache file after processing.
1416Can be flagged using C<--no-cache-delete> as well.
1417Defaults to C<true>.
1418
Akronf73ffb62018-06-27 12:13:59 +02001419
Akron636aa112017-04-07 18:48:56 +02001420=item B<--config|-cfg>
1421
1422Configure the parameters of your call in a file
1423of key-value pairs with whitespace separator
1424
1425 overwrite 1
1426 token DeReKo#Structure
1427 ...
1428
1429Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001430C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001431C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001432C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001434C<base-sentences>, C<base-paragraphs>,
1435C<base-pagebreaks>,
1436C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001437(semicolon separated), C<anno> (semicolon separated).
1438
Akronf73ffb62018-06-27 12:13:59 +02001439Configuration parameters will always be overwritten by
1440passed parameters.
1441
1442
Akron81500102017-04-07 20:45:44 +02001443=item B<--temporary-extract|-te>
1444
1445Only valid for the C<archive> command.
1446
1447This will first extract all files into a
1448directory and then will archive.
1449If the directory is given as C<:temp:>,
1450a temporary directory is used.
1451This is especially useful to avoid
1452massive unzipping and potential
1453network latency.
Akron636aa112017-04-07 18:48:56 +02001454
Akronf73ffb62018-06-27 12:13:59 +02001455
Akrone10ad322016-02-27 10:54:26 +01001456=item B<--sigle|-sg>
1457
Akron20807582016-10-26 17:11:34 +02001458Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001459Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001460I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001461Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001462In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001463On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001464
Akronf73ffb62018-06-27 12:13:59 +02001465
Akron941c1a62016-02-23 17:41:41 +01001466=item B<--log|-l>
1467
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1469
Akronf73ffb62018-06-27 12:13:59 +02001470
Akron941c1a62016-02-23 17:41:41 +01001471=item B<--help|-h>
1472
1473Print this document.
1474
Akronf73ffb62018-06-27 12:13:59 +02001475
Akron941c1a62016-02-23 17:41:41 +01001476=item B<--version|-v>
1477
1478Print version information.
1479
1480=back
1481
Akronf73ffb62018-06-27 12:13:59 +02001482
Akronc13a1702016-03-15 19:33:14 +01001483=head1 ANNOTATION SUPPORT
1484
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1489
Akron821db3d2017-04-06 21:19:31 +02001490 Base
1491 #Paragraphs
1492 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001493
Akron821db3d2017-04-06 21:19:31 +02001494 Connexor
1495 #Morpho
1496 #Phrase
1497 #Sentences
1498 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001499
Akron821db3d2017-04-06 21:19:31 +02001500 CoreNLP
1501 #Constituency
1502 #Morpho
1503 #NamedEntities
1504 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001505
Akronce125b62017-06-19 11:54:36 +02001506 CMC
1507 #Morpho
1508
Akron821db3d2017-04-06 21:19:31 +02001509 DeReKo
1510 #Structure
Akronc13a1702016-03-15 19:33:14 +01001511
Akron57510c12019-01-04 14:58:53 +01001512 DGD
1513 #Morpho
1514
Akron821db3d2017-04-06 21:19:31 +02001515 DRuKoLa
1516 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001517
Akron821db3d2017-04-06 21:19:31 +02001518 Glemm
1519 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001520
Akronea1aed52018-07-19 14:43:34 +02001521 HNC
1522 #Morpho
1523
Akron4c679192018-01-16 17:41:49 +01001524 LWC
1525 #Dependency
1526
Akron821db3d2017-04-06 21:19:31 +02001527 Malt
1528 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001529
Akron821db3d2017-04-06 21:19:31 +02001530 MarMoT
1531 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001532
Akron821db3d2017-04-06 21:19:31 +02001533 Mate
1534 #Dependency
1535 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001536
Akron821db3d2017-04-06 21:19:31 +02001537 MDParser
1538 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001539
Akron821db3d2017-04-06 21:19:31 +02001540 OpenNLP
1541 #Morpho
1542 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001543
Akron821db3d2017-04-06 21:19:31 +02001544 Sgbr
1545 #Lemma
1546 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001547
Akron821db3d2017-04-06 21:19:31 +02001548 TreeTagger
1549 #Morpho
1550 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001551
Akron821db3d2017-04-06 21:19:31 +02001552 XIP
1553 #Constituency
1554 #Morpho
1555 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001556
Akronc13a1702016-03-15 19:33:14 +01001557
1558More importers are in preparation.
1559New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1560See the built-in annotation importers as examples.
1561
Akronf73ffb62018-06-27 12:13:59 +02001562
Akron941c1a62016-02-23 17:41:41 +01001563=head1 AVAILABILITY
1564
1565 https://github.com/KorAP/KorAP-XML-Krill
1566
1567
1568=head1 COPYRIGHT AND LICENSE
1569
Akroned9baf02019-01-22 17:03:25 +01001570Copyright (C) 2015-2019, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001571
Akron941c1a62016-02-23 17:41:41 +01001572Author: L<Nils Diewald|http://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001573
Akrona76d8352016-10-27 16:27:32 +02001574Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001575
1576L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1577Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001578L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001579member of the
1580L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
1581
1582This program is free software published under the
1583L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1584
1585=cut