blob: 06db4b6c44893e31a0abc5d716b143a1d485ee2b [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akron941c1a62016-02-23 17:41:41 +01004use FindBin;
5BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
6use File::Spec::Functions qw/catfile catdir/;
7use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00008use Benchmark qw/:hireswallclock/;
9use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010010use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000011use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010012use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010013use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010014use Directory::Iterator;
Akron93d620e2016-02-05 19:40:05 +010015use KorAP::XML::Krill;
Akron941c1a62016-02-23 17:41:41 +010016use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010017use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020018use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020019use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010020use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010021use v5.10;
22use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron941c1a62016-02-23 17:41:41 +0100150# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100151
Akronf1849aa2019-12-16 23:35:33 +0100152our $LAST_CHANGE = '2019/12/17';
Akron941c1a62016-02-23 17:41:41 +0100153our $LOCAL = $FindBin::Bin;
Akron263274c2019-02-07 09:48:30 +0100154our $KORAL_VERSION = 0.03;
Akron941c1a62016-02-23 17:41:41 +0100155our $VERSION_MSG = <<"VERSION";
156Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
157VERSION
158
# Forward declarations of helpers that are defined further down
sub get_file_name_from_glob($);
sub get_file_name($);

# Parse command: a leading argument that does not start with a dash
# is taken as the command name and removed from the argument list
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Keep a copy of the remaining arguments - serial mode passes
# them on to the 'archive' runs it spawns
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100170
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Single value options - they stay undefined unless given
# on the command line
my (
  $input_base, $output, $overwrite, $meta, $token_base,
  $base_sentences, $base_paragraphs, $base_pagebreaks,
  $gzip, $extract_dir, $cache_file, $cfg_file, $log_level,
  $primary, $pretty, $jobs, $koral, $to_tar,
  $non_word_tokens, $non_verbal_tokens, $sequential_extraction,
  $cache_size, $cache_delete, $cache_init
);

# Print version message and the main POD sections
my $show_help = sub {
  pod2usage(
    -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
    -verbose  => 99,
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Print version message only
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$input_base,
  'output|o=s'               => \$output,
  'overwrite|w'              => \$overwrite,
  'meta|m=s'                 => \$meta,
  'token|t=s'                => \$token_base,
  'base-sentences|bs=s'      => \$base_sentences,
  'base-paragraphs|bp=s'     => \$base_paragraphs,
  'base-pagebreaks|bpb=s'    => \$base_pagebreaks,
  'gzip|z'                   => \$gzip,
  'temporary-extract|te=s'   => \$extract_dir,
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cache_file,
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$log_level,
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'jobs|j=i'                 => \$jobs,
  'koral|k=f'                => \$koral,
  'to-tar'                   => \$to_tar,
  'non-word-tokens|nwt'      => \$non_word_tokens,
  'non-verbal-tokens|nvt'    => \$non_verbal_tokens,
  'sequential-extraction|se' => \$sequential_extraction,
  'cache-size|cs=s'          => \$cache_size,
  'cache-delete|cd!'         => \$cache_delete,
  'cache-init|ci!'           => \$cache_init,
  'help|h'                   => $show_help,
  'version|v'                => $show_version
);
220
Akron63f20d42017-04-10 23:40:29 +0200221
# Load from configuration.
# Values from the configuration file only fill in options
# that were not given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Configuration key => single value option
  my @scalar_settings = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'koral'                 => \$koral ],              # Koral version
    [ 'input-base'            => \$input_base ],         # Input root base directory
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],         # Token base
    [ 'non-word-tokens'       => \$non_word_tokens ],    # Non-word tokenization
    [ 'non-verbal-tokens'     => \$non_verbal_tokens ],  # Non-verbal tokenization
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'to-tar'                => \$to_tar ],             # Write to tar
    [ 'log'                   => \$log_level ]
  );

  foreach my $setting (@scalar_settings) {
    my ($key, $target) = @$setting;

    # The command line takes precedence
    next if defined $$target;

    $$target = $config{$key} if defined $config{$key};
  }

  # Configuration key => multi value option,
  # values are separated by semicolons in the configuration
  my @list_settings = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $setting (@list_settings) {
    my ($key, $target) = @$setting;

    # The command line takes precedence
    next if scalar(@$target) || !defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  }
}
348
Akron63f20d42017-04-10 23:40:29 +0200349
# Set defaults for all options that are still undefined after
# reading the command line and the configuration
foreach my $default (
  [ \$token_base            => 'OpenNLP#tokens' ],
  [ \$cache_file            => 'korapxml2krill.cache' ],
  [ \$cache_size            => '50m' ],
  [ \$jobs                  => 0 ],
  [ \$koral                 => $KORAL_VERSION ],
  [ \$cache_delete          => 1 ],
  [ \$cache_init            => 1 ],
  [ \$sequential_extraction => 0 ],
  [ \$log_level             => 'ERROR' ],
  [ \$base_sentences        => '' ],
  [ \$base_paragraphs       => '' ],
  [ \$base_pagebreaks       => '' ],
  [ \$non_word_tokens       => 0 ],
  [ \$non_verbal_tokens     => 0 ]
) {
  my ($target, $value) = @$default;
  $$target = $value unless defined $$target;
}

# The base annotation names are compared in lower case later on
$_ = lc $_ foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);
Akron3741f8b2016-12-21 19:55:21 +0100369
Akron63f20d42017-04-10 23:40:29 +0200370
# Initialize log4perl object - everything at or above the
# requested level is written to a colored screen appender
{
  my %log4perl_conf = (
    'log4perl.rootLogger' => uc($log_level) . ', STDERR',
    'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout' => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
  );

  Log::Log4perl->init(\%log4perl_conf);
}

my $log = Log::Log4perl->get_logger('main');

# Report the configuration file that was requested
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
}
383
384
# Parameters for pod2usage in case of wrong usage:
# print the help sections and exit with an error code
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined and
# gzip has no effect, if no output is given
if (!@input || ($gzip && !$output)) {
  pod2usage(%ERROR_HASH);
}

# A job count of -1 means: choose the number of jobs
# based on the number of available cores
if ($jobs eq '-1') {
  my $core_count = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $core_count);
  $log->info("Run using $jobs jobs on $core_count cores");
}
405
Akron821db3d2017-04-06 21:19:31 +0200406
# Start serial processing:
# Every input is converted by a separate run of this script in
# 'archive' mode, one after the other. All command line parameters
# except for input and output are passed on to these runs.
# Exits with 1 if at least one of the runs failed.
if ($cmd && $cmd eq 'serial') {

  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs (flag and following value),
  # as they are set individually for each run
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    };

    # Input or output value
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;

  # Number of runs that did not finish successfully
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command, using the running interpreter and script
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - keep on processing the remaining inputs
    # on failure, but remember that something went wrong
    if (system(@archive_cmd) != 0) {
      $log->error(
        "Serial processing of $_ failed" .
          ($? == -1 ? ": $!" : ' with exit code ' . ($? >> 8))
      );
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
461
# Lookup table of all foundries and layers to skip (lower cased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# Structures DeReKo is meant to provide as base annotations
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# All supported annotations as [Foundry, Layer, (Parameter)?].
# The order of the list is kept when the annotations are passed on.
my @layers = (

  # Base - only if no other base annotation is requested
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo - optionally with the requested base structures
  (
    @dereko_attr
      ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
      : ['DeReKo', 'Structure']
  ),

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure' ? ['DGD', 'Structure', 'base-sentence'] : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
556
Akron4fa37c32017-01-20 14:43:10 +0100557
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested
# annotations (given as Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);

    # Keep, unless Foundry or Foundry#Layer should be skipped
    !$skip{$foundry} && !$skip{$foundry . '#' . lc($_->[1])};
  } @layers;
}
576
# Get tokenization basis, given as "Foundry#Layer".
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;

# Remove file extension - the layer is undefined in case
# the token base contains no layer part
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;
Akrone1dbc382016-07-08 22:24:52 +0200582
# TODO: This should not be initialized for batch
# Cache shared via the cache file
my $cache = do {
  my %cache_param = (
    share_file => $cache_file,
    cache_size => $cache_size,
    init_file  => $cache_init
  );
  Cache::FastMmap->new(%cache_param);
};

# Create batch object, configured with all conversion options
my $batch_file = do {
  my %batch_param = (
    cache             => $cache,
    meta_type         => $meta,
    overwrite         => $overwrite,
    foundry           => $token_base_foundry,
    layer             => $token_base_layer,
    gzip              => $gzip,
    log               => $log,
    koral             => $koral,
    primary           => $primary,
    pretty            => $pretty,
    anno              => \@filtered_anno,
    non_word_tokens   => $non_word_tokens,
    non_verbal_tokens => $non_verbal_tokens
  );
  KorAP::XML::Batch::File->new(%batch_param);
};
606
# Get file name based on path information.
# Takes the path of a text and returns a flat name, with the
# temporary directory and the path of the first input removed
# and all remaining slashes replaced by dashes. In case more than
# three dash separated parts remain, the leading part is dropped.
sub get_file_name ($) {
  my $file = shift;

  # The first input is the base of all paths
  my $base = $input[0] // '';

  # In case of a directory, keep the last path
  # segment as part of the file name
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input path - the path is quoted, as it has to
  # match literally and may contain regex meta characters
  $file =~ s/\/?\Q$base\E//;

  # Flatten the path
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
623
Akron63f20d42017-04-10 23:40:29 +0200624
# Turn an input path or glob pattern into a flat name:
# wildcards are dropped, path separators and brackets become
# dashes and a trailing '.zip' is removed.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  $name =~ tr/*?//d;         # Remove arbitrary fills
  $name =~ tr!\\/{}[]!-!;    # Transform paths, class and multiple brackets
  $name =~ s/-{2,}/-/g;      # Remove sequences of binding characters
  $name =~ s/^-|-$//g;       # Clean beginning and end
  $name =~ s/\.zip$//;       # Remove file extension

  return $name;
};
636
637
# Convert sigle to path construct,
# i.e. the first underscore and the following dot become slashes
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
}

# All commands writing to an output directory
# require the directory to exist
if ($cmd && $output && !defined($to_tar) && !(-e $output && -d $output)) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
}
647
Akron63f20d42017-04-10 23:40:29 +0200648
# Glob and prefix files
if (@input) {

  # Expand all wildcards - prefixed with the input root, if given
  my @expanded;
  foreach my $pattern (@input) {
    push @expanded, bsd_glob(
      $input_base ? catfile($input_base, $pattern) : $pattern
    );
  }

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @expanded;

  print 'Input is ' . join(', ', @input) . "\n";
}
668
669
# Take the start time as early as possible
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
}

# Log the time passed since the last stop and since the start
sub stop_time {
  my $now = Benchmark->new;
  my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->info(
    'The code took: ' . $since_last_stop . ' (overall: ' . $overall . ')'
  );

  $main::LAST_STOP = $now;
}

# Process a single file
unless ($cmd) {
  my $single_input = $input[0];

  # Ensure the path ends with a slash
  $single_input =~ s{([^/])$}{$1/};

  # Create, parse and convert the document
  $batch_file->process($single_input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
}
701
Nils Diewald59094f22014-11-05 18:20:50 +0000702
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object based on the first input
  my $archive;
  unless (-f($input[0]) && ($archive = KorAP::XML::Archive->new($input[0]))) {

    # Can't create archive object
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  }

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  }

  # Attach all further inputs as annotation archives
  foreach my $annotation_archive (@input[1..$#input]) {
    $archive->attach($annotation_archive);
  }

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  }
}
747
Akron81500102017-04-07 20:45:44 +0200748
Akron941c1a62016-02-23 17:41:41 +0100749# Process an archive
750elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000751
Akron81500102017-04-07 20:45:44 +0200752 my $archive_output;
753
754 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100755 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200756
757 # Create new archive object
758 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
759
760 # Check zip capabilities
761 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200762 $log->error("Unzip is not installed or incompatible.");
763 exit 1;
Akron81500102017-04-07 20:45:44 +0200764 };
765
766 # Add further annotation archived
767 $archive->attach($_) foreach @input[1..$#input];
768
769 # Create a temporary directory
770 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200771 $extract_dir = tempdir(CLEANUP => 0);
772 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200773 };
774
Akron63f20d42017-04-10 23:40:29 +0200775 # Add some random extra to avoid clashes with multiple archives
776 $extract_dir = catdir($extract_dir, random_string('cccccc'));
777
Akron31a08cb2019-02-20 20:43:26 +0100778 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200779 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200780 @input = ($extract_dir);
781 }
782 else {
783 $log->error('Unable to extract from primary archive ' . $input[0] .
784 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200785 exit 1;
Akron81500102017-04-07 20:45:44 +0200786 };
787 }
788
789 # Can't create archive object
790 else {
791 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200792 exit 1;
Akron81500102017-04-07 20:45:44 +0200793 };
794 };
795
Akron7d4cdd82016-08-17 21:39:45 +0200796 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100797 my $pool = Parallel::ForkManager->new($jobs);
798
Akron7d4cdd82016-08-17 21:39:45 +0200799 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100800 my $iter = 1; # Current text in process
801
Akronda3097e2017-04-23 19:53:57 +0200802 my $tar_archive;
803 my $output_dir = $output;
804 my $tar_fh;
805
806 # Initialize tar archive
807 if ($to_tar) {
808 $tar_archive = Archive::Tar::Builder->new(
809 ignore_errors => 1
810 );
811
812 # Set output name
813 my $tar_file = $output;
814 unless ($tar_file =~ /\.tar$/) {
815 $tar_file .= '.tar';
816 };
817
818 # Initiate the tar file
819 print "Writing to file $tar_file\n";
820 $tar_fh = IO::File->new($tar_file, 'w');
821 $tar_fh->binmode(1);
822
823 # Set handle
824 $tar_archive->set_handle($tar_fh);
825
826 # Output to temporary directory
827 $output_dir = File::Temp->newdir;
828 };
829
Akron941c1a62016-02-23 17:41:41 +0100830 # Report on fork message
831 $pool->run_on_finish (
832 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200833 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100834 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200835
Akron08385f62016-03-22 20:37:04 +0100836 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200837 ($iter++) . "/$count]" .
838 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200839 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200840
841 if (!$code && $to_tar && $data->[2]) {
842 my $filename = $data->[2];
843
844 # Lock filehandle
845 if (flock($tar_fh, LOCK_EX)) {
846
Akron9a062ce2017-07-04 19:12:05 +0200847 my $clean_file = fileparse($filename);
848
Akronda3097e2017-04-23 19:53:57 +0200849 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200850 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200851 unlink $filename;
852
853 # Unlock filehandle
854 flock($tar_fh, LOCK_UN);
855 }
856 else {
857 $log->warn("Unable to add $filename to archive");
858 };
859 };
860
Akron4c0cf312016-10-15 16:42:09 +0200861 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100862 }
863 );
864
865 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200866 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100867 print "Reading data ...\n";
868
Akron7d4cdd82016-08-17 21:39:45 +0200869 # unless (Cache::FastMmap->new(
870 # share_file => $cache_file,
871 # cache_size => $cache_size,
872 # init_file => $cache_init
873 # )) {
874 # print "Unable to intialize cache '$cache_file'\n\n";
875 # exit(1);
876 # };
Akron11c80302016-03-18 19:44:43 +0100877
Akron486f9ab2017-04-22 23:25:19 +0200878
Akron941c1a62016-02-23 17:41:41 +0100879 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100880 if (-d $input[0]) {
881 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100882 my @dirs;
883 my $dir;
884
Akron7d4cdd82016-08-17 21:39:45 +0200885 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100886 while (1) {
887 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200888 push @dirs, $dir;
889 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100890 };
891 last unless $it->next;
892 };
893
894 print "Start processing ...\n";
895 $t = Benchmark->new;
896 $count = scalar @dirs;
897
898 DIRECTORY_LOOP:
899 for (my $i = 0; $i < $count; $i++) {
900
Akrone1dbc382016-07-08 22:24:52 +0200901 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200902 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200903 get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200904 );
Akron941c1a62016-02-23 17:41:41 +0100905
906 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200907 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200908
Akron13d56622016-10-31 14:54:49 +0100909 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200910 $pool->finish(
911 0,
Akronda3097e2017-04-23 19:53:57 +0200912 [
913 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
914 undef,
915 $filename
916 ]
Akron486f9ab2017-04-22 23:25:19 +0200917 );
Akron3ec48972016-08-17 23:24:52 +0200918 }
919 else {
Akron4c0cf312016-10-15 16:42:09 +0200920 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200921 };
Akron941c1a62016-02-23 17:41:41 +0100922 };
923 }
924
925 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200926 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200927
Akron941c1a62016-02-23 17:41:41 +0100928 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200929 $log->error("Unzip is not installed or incompatible.");
930 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100931 };
932
Akron08385f62016-03-22 20:37:04 +0100933 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200934 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100935
Akron31a08cb2019-02-20 20:43:26 +0100936 # Get sigles to extract
937 my $prefix = set_sigle($archive);
938
Akron941c1a62016-02-23 17:41:41 +0100939 print "Start processing ...\n";
940 $t = Benchmark->new;
941 my @dirs = $archive->list_texts;
942 $count = scalar @dirs;
943
944 ARCHIVE_LOOP:
945 for (my $i = 0; $i < $count; $i++) {
946
947 # Split path information
948 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
949
Akrone1dbc382016-07-08 22:24:52 +0200950 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200951 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200952 get_file_name(
953 catfile($corpus, $doc, $text)
954 . '.json' . ($gzip ? '.gz' : '')
955 )
Akrone1dbc382016-07-08 22:24:52 +0200956 );
Akron941c1a62016-02-23 17:41:41 +0100957
958 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200959 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100960
Akron4c0cf312016-10-15 16:42:09 +0200961 # Create temporary file
962 $temp = File::Temp->newdir;
963
Akronbdf434a2016-10-24 17:42:07 +0200964 # TODO: Check if $filename exist at the beginning,
965 # because extraction can be horrible slow!
966
Akron941c1a62016-02-23 17:41:41 +0100967 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100968 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100969
Akron7d4cdd82016-08-17 21:39:45 +0200970 # Create corpus directory
971 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100972
Akron7d4cdd82016-08-17 21:39:45 +0200973 # Temporary directory
974 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100975
Akron7d4cdd82016-08-17 21:39:45 +0200976 # Write file
Akron13d56622016-10-31 14:54:49 +0100977 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200978
Akron4c0cf312016-10-15 16:42:09 +0200979 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100980 $pool->finish(
981 0,
Akronda3097e2017-04-23 19:53:57 +0200982 [
983 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
984 $temp,
985 $filename
986 ]
Akron13d56622016-10-31 14:54:49 +0100987 );
988 #$pool->finish(0, ["Processed " . $filename, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200989 }
990 else {
Akron4c0cf312016-10-15 16:42:09 +0200991 # Delete temporary file
992 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200993 };
Akron941c1a62016-02-23 17:41:41 +0100994 }
Akron7d4cdd82016-08-17 21:39:45 +0200995
996 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100997 else {
Akron4c0cf312016-10-15 16:42:09 +0200998 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100999 };
1000 };
1001 }
1002
1003 else {
1004 print "Input is neither a directory nor an archive.\n\n";
1005 };
1006
1007 $pool->wait_all_children;
1008
Akron11c80302016-03-18 19:44:43 +01001009 # Delete cache file
1010 unlink($cache_file) if $cache_delete;
1011
Akronda3097e2017-04-23 19:53:57 +02001012 # Close tar filehandle
1013 if ($to_tar && $tar_fh) {
1014 $tar_archive->finish;
1015 $tar_fh->close;
1016 print "Wrote to tar archive.\n";
1017 };
1018
Akron63f20d42017-04-10 23:40:29 +02001019 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +01001020 print "Done.\n";
Akron81500102017-04-07 20:45:44 +02001021};
Akron941c1a62016-02-23 17:41:41 +01001022
Nils Diewald2db9ad02013-10-29 19:26:43 +00001023
# Prepare the file-level list of sigles (@sigle) for an archive
# and return the prefix information of the archive.
#
# Parameter:
#   $archive - archive object; this sub calls list_texts, split_path,
#              check_prefix and extract_sigle on it.
#
# Behaviour:
#   - If @sigle is empty, it is filled with one "corpus/doc/text"
#     sigle per text listed by the archive. The returned prefix is
#     the one split_path reported for the last listed text
#     (or 1, if the archive lists no texts).
#   - Otherwise every entry that looks like a document sigle
#     ("corpus/doc", optionally with a leading "./") is extracted
#     to $output immediately and removed from @sigle, so that only
#     text sigles remain. The prefix is determined once by
#     check_prefix on the first entry.
#
# Returns the prefix value (defaults to 1).
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given: take every text of the archive
  if (!@sigle) {
    for my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # The prefix needs to be checked only once, no matter
  # which kind of sigle is seen first
  my $prefix_checked = 0;
  my $check_prefix_once = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Sigles given: extract doc sigles right away, keep text sigles.
  # A named loop variable is used on purpose, so the archive
  # methods called below can't clobber the current entry via $_.
  my @text_sigles;
  for my $entry (@sigle) {

    # Sigle is a doc sigle
    if ($entry =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$entry ...";
      $check_prefix_once->();
      print "\n";

      # Extract first, report afterwards
      my $extracted = $archive->extract_sigle(
        [$entry], $output, $sequential_extraction ? 1 : $jobs
      );
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigles, $entry;
      $check_prefix_once->();
    };
  };

  @sigle = @text_sigles;

  return $prefix;
};
1097
1098
1099
# Remove the extraction directory (if one was used)
# and report the count returned by remove_tree
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  print 'Removed directory ' . $extract_dir . ' with ' . $removed . " objects.\n";
};


print "\n";
1108
Nils Diewald2db9ad02013-10-29 19:26:43 +00001109__END__
Akron941c1a62016-02-23 17:41:41 +01001110
1111=pod
1112
1113=encoding utf8
1114
1115=head1 NAME
1116
Akron42f48c12020-02-14 13:08:13 +01001117korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001118
1119
1120=head1 SYNOPSIS
1121
Akrona76d8352016-10-27 16:27:32 +02001122 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001123
Akron2fd402b2016-10-27 21:26:48 +02001124
Akron941c1a62016-02-23 17:41:41 +01001125=head1 DESCRIPTION
1126
1127L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1128compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001129The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001130
1131
1132=head1 INSTALLATION
1133
1134The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1135
Akronaf386982016-10-12 00:33:25 +02001136 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001137
Akronc13a1702016-03-15 19:33:14 +01001138In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001139be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001140Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001142
1143=head1 ARGUMENTS
1144
Akrona76d8352016-10-27 16:27:32 +02001145 $ korapxml2krill -z --input <directory> --output <filename>
1146
1147Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001148It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001149
Akron941c1a62016-02-23 17:41:41 +01001150=over 2
1151
1152=item B<archive>
1153
Akron081639e2017-04-21 19:01:39 +02001154 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001155
Akron2fd402b2016-10-27 21:26:48 +02001156Converts an archive of KorAP-XML documents. It expects a directory
1157(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001158
1159=item B<extract>
1160
Akrona76d8352016-10-27 16:27:32 +02001161 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1162
1163Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001164
Akron63f20d42017-04-10 23:40:29 +02001165=item B<serial>
1166
1167 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1168
1169Convert archives sequentially. The inputs are not merged but treated
1170as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001172are created based on the archive name. In case the C<--to-tar> flag is given,
1173the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001174
1175
Akron941c1a62016-02-23 17:41:41 +01001176=back
1177
1178
1179=head1 OPTIONS
1180
1181=over 2
1182
Akrona76d8352016-10-27 16:27:32 +02001183=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001184
Akrona76d8352016-10-27 16:27:32 +02001185Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001186
Akron7606afa2016-10-25 16:23:49 +02001187Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001188document, while C<archive> expects a KorAP-XML corpus folder or a zip
1189file to batch process multiple files.
1190C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001191
Akrona76d8352016-10-27 16:27:32 +02001192C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001193that the first archive listed contains all primary data files
1194and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001195
Akron7606afa2016-10-25 16:23:49 +02001196 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001197
Akron821db3d2017-04-06 21:19:31 +02001198Input may also be defined using BSD glob wildcards.
1199
1200 -i 'file/news*.zip'
1201
1202The extended input array will be sorted in length order, so the shortest
1203path needs to contain all primary data files and all meta data files.
1204
Akron0c3e3752016-06-28 15:55:53 +02001205(The directory structure follows the base directory format,
1206that may include a C<.> root folder.
1207In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001208need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001210
Akron7606afa2016-10-25 16:23:49 +02001211To support zip files, a version of C<unzip> needs to be installed that is
1212compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001213
Akron7606afa2016-10-25 16:23:49 +02001214B<The root folder switch using the hash sign is experimental and
1215may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001216
Akronf73ffb62018-06-27 12:13:59 +02001217
Akron63f20d42017-04-10 23:40:29 +02001218=item B<--input-base|-ib> <directory>
1219
1220The base directory for inputs.
1221
1222
Akron941c1a62016-02-23 17:41:41 +01001223=item B<--output|-o> <directory|file>
1224
1225Output folder for archive processing or
1226document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001227writes to C<STDOUT> by default
1228(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001229
1230=item B<--overwrite|-w>
1231
1232Overwrite files that already exist.
1233
Akronf73ffb62018-06-27 12:13:59 +02001234
Akron3741f8b2016-12-21 19:55:21 +01001235=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001236
1237Define the default tokenization by specifying
1238the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001239of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001240This will directly take the file instead of running
1241the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001242
Akron3741f8b2016-12-21 19:55:21 +01001243
1244=item B<--base-sentences|-bs> <foundry>#<layer>
1245
1246Define the layer for base sentences.
1247If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001248Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1249layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001250
1251 Defaults to unset.
1252
1253
1254=item B<--base-paragraphs|-bp> <foundry>#<layer>
1255
1256Define the layer for base paragraphs.
1257If given, this will be used instead of using C<Base#Paragraphs>.
1258Currently C<DeReKo#Structure> is the only additional layer supported.
1259
1260 Defaults to unset.
1261
1262
Akron41ac10b2017-02-08 22:47:25 +01001263=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1264
1265Define the layer for base pagebreaks.
1266Currently C<DeReKo#Structure> is the only layer supported.
1267
1268 Defaults to unset.
1269
1270
Akron941c1a62016-02-23 17:41:41 +01001271=item B<--skip|-s> <foundry>[#<layer>]
1272
Akronf7ad89e2016-03-16 18:22:47 +01001273Skip specific annotations by specifying the foundry
1274(and optionally the layer with a C<#>-prefix),
1275e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001276Can be set multiple times.
1277
Akronf73ffb62018-06-27 12:13:59 +02001278
Akronc13a1702016-03-15 19:33:14 +01001279=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001280
Akronf7ad89e2016-03-16 18:22:47 +01001281Convert specific annotations by specifying the foundry
1282(and optionally the layer with a C<#>-prefix),
1283e.g. C<Mate> or C<Mate#Morpho>.
1284Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001285
Akronf73ffb62018-06-27 12:13:59 +02001286
Akron941c1a62016-02-23 17:41:41 +01001287=item B<--primary|-p>
1288
Akronc13a1702016-03-15 19:33:14 +01001289Output primary data or not. Defaults to C<true>.
Akronf7ad89e2016-03-16 18:22:47 +01001290Can be flagged using C<--no-primary> as well.
1291This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001292
Akronf73ffb62018-06-27 12:13:59 +02001293
Akroned9baf02019-01-22 17:03:25 +01001294=item B<--non-word-tokens|-nwt>
1295
1296Tokenize non-word tokens like word tokens (defined as matching
1297C</[\d\w]/>). Useful to treat punctuations as tokens.
1298
1299 Defaults to unset.
1300
Akronf1849aa2019-12-16 23:35:33 +01001301
1302=item B<--non-verbal-tokens|-nvt>
1303
Tokenize non-verbal tokens that are marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
1306
1307 Defaults to unset.
1308
1309
Akron941c1a62016-02-23 17:41:41 +01001310=item B<--jobs|-j>
1311
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001313for archive processing.
Akron11c80302016-03-18 19:44:43 +01001314Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001315
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1318
Akronc11f7982017-02-21 21:20:14 +01001319Pass -1, and the value will be set automatically to 5
1320times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001321This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001322
Akronf73ffb62018-06-27 12:13:59 +02001323
Akron263274c2019-02-07 09:48:30 +01001324=item B<--koral|-k>
1325
1326Version of the output format. Supported versions are:
1327C<0> for legacy serialization, C<0.03> for serialization
1328with metadata fields as key-values on the root object,
1329C<0.4> for serialization with metadata fields as a list
1330of C<"@type":"koral:field"> objects.
1331
1332Currently defaults to C<0.03>.
1333
1334
Akron9ec88872017-04-12 16:29:06 +02001335=item B<--sequential-extraction|-se>
1336
Flag to indicate that extraction runs in a single job, i.e. that the C<jobs> value does not apply to extraction.
1338Some systems may have problems with extracting multiple archives
1339to the same folder at the same time.
1340Can be flagged using C<--no-sequential-extraction> as well.
1341Defaults to C<false>.
1342
Akronf73ffb62018-06-27 12:13:59 +02001343
Akron35db6e32016-03-17 22:42:22 +01001344=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001345
Akron35db6e32016-03-17 22:42:22 +01001346Define the metadata parser to use. Defaults to C<I5>.
1347Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1348This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001349
Akronf73ffb62018-06-27 12:13:59 +02001350
Akron941c1a62016-02-23 17:41:41 +01001351=item B<--pretty|-y>
1352
Akronc13a1702016-03-15 19:33:14 +01001353Pretty print JSON output. Defaults to C<false>.
Akron35db6e32016-03-17 22:42:22 +01001354This is I<deprecated>.
Akron941c1a62016-02-23 17:41:41 +01001355
Akronf73ffb62018-06-27 12:13:59 +02001356
Akron941c1a62016-02-23 17:41:41 +01001357=item B<--gzip|-z>
1358
Akronf7ad89e2016-03-16 18:22:47 +01001359Compress the output.
1360Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001361
Akronf73ffb62018-06-27 12:13:59 +02001362
Akron11c80302016-03-18 19:44:43 +01001363=item B<--cache|-c>
1364
1365File to mmap a cache (using L<Cache::FastMmap>).
1366Defaults to C<korapxml2krill.cache> in the calling directory.
1367
Akronf73ffb62018-06-27 12:13:59 +02001368
Akron11c80302016-03-18 19:44:43 +01001369=item B<--cache-size|-cs>
1370
1371Size of the cache. Defaults to C<50m>.
1372
Akronf73ffb62018-06-27 12:13:59 +02001373
Akron11c80302016-03-18 19:44:43 +01001374=item B<--cache-init|-ci>
1375
1376Initialize cache file.
1377Can be flagged using C<--no-cache-init> as well.
1378Defaults to C<true>.
1379
Akronf73ffb62018-06-27 12:13:59 +02001380
Akron11c80302016-03-18 19:44:43 +01001381=item B<--cache-delete|-cd>
1382
1383Delete cache file after processing.
1384Can be flagged using C<--no-cache-delete> as well.
1385Defaults to C<true>.
1386
Akronf73ffb62018-06-27 12:13:59 +02001387
Akron636aa112017-04-07 18:48:56 +02001388=item B<--config|-cfg>
1389
1390Configure the parameters of your call in a file
1391of key-value pairs with whitespace separator
1392
1393 overwrite 1
1394 token DeReKo#Structure
1395 ...
1396
1397Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001398C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001399C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001400C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001402C<base-sentences>, C<base-paragraphs>,
1403C<base-pagebreaks>,
1404C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001405(semicolon separated), C<anno> (semicolon separated).
1406
Akronf73ffb62018-06-27 12:13:59 +02001407Configuration parameters will always be overwritten by
1408passed parameters.
1409
1410
Akron81500102017-04-07 20:45:44 +02001411=item B<--temporary-extract|-te>
1412
1413Only valid for the C<archive> command.
1414
1415This will first extract all files into a
1416directory and then will archive.
1417If the directory is given as C<:temp:>,
1418a temporary directory is used.
1419This is especially useful to avoid
1420massive unzipping and potential
1421network latency.
Akron636aa112017-04-07 18:48:56 +02001422
Akronf73ffb62018-06-27 12:13:59 +02001423
Akronc93a0802019-07-11 15:48:34 +02001424=item B<--to-tar>
1425
1426Only valid for the C<archive> command.
1427
1428Writes the output into a tar archive.
1429
1430
Akrone10ad322016-02-27 10:54:26 +01001431=item B<--sigle|-sg>
1432
Akron20807582016-10-26 17:11:34 +02001433Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001434Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001435I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001436Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001437In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001438On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001439
Akronf73ffb62018-06-27 12:13:59 +02001440
Akron941c1a62016-02-23 17:41:41 +01001441=item B<--log|-l>
1442
The L<Log::Log4perl> log level, defaults to C<ERROR>.
1444
Akronf73ffb62018-06-27 12:13:59 +02001445
Akron941c1a62016-02-23 17:41:41 +01001446=item B<--help|-h>
1447
Akron42f48c12020-02-14 13:08:13 +01001448Print help information.
Akron941c1a62016-02-23 17:41:41 +01001449
Akronf73ffb62018-06-27 12:13:59 +02001450
Akron941c1a62016-02-23 17:41:41 +01001451=item B<--version|-v>
1452
1453Print version information.
1454
1455=back
1456
Akronf73ffb62018-06-27 12:13:59 +02001457
Akronc13a1702016-03-15 19:33:14 +01001458=head1 ANNOTATION SUPPORT
1459
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
L<Krill|https://github.com/KorAP/Krill>.
1464
Akron821db3d2017-04-06 21:19:31 +02001465 Base
1466 #Paragraphs
1467 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001468
Akron821db3d2017-04-06 21:19:31 +02001469 Connexor
1470 #Morpho
1471 #Phrase
1472 #Sentences
1473 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001474
Akron821db3d2017-04-06 21:19:31 +02001475 CoreNLP
1476 #Constituency
1477 #Morpho
1478 #NamedEntities
1479 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001480
Akronce125b62017-06-19 11:54:36 +02001481 CMC
1482 #Morpho
1483
Akron821db3d2017-04-06 21:19:31 +02001484 DeReKo
1485 #Structure
Akronc13a1702016-03-15 19:33:14 +01001486
Akron57510c12019-01-04 14:58:53 +01001487 DGD
1488 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001489 #Structure
Akron57510c12019-01-04 14:58:53 +01001490
Akron821db3d2017-04-06 21:19:31 +02001491 DRuKoLa
1492 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001493
Akron821db3d2017-04-06 21:19:31 +02001494 Glemm
1495 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001496
Akronea1aed52018-07-19 14:43:34 +02001497 HNC
1498 #Morpho
1499
Akron4c679192018-01-16 17:41:49 +01001500 LWC
1501 #Dependency
1502
Akron821db3d2017-04-06 21:19:31 +02001503 Malt
1504 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001505
Akron821db3d2017-04-06 21:19:31 +02001506 MarMoT
1507 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001508
Akron821db3d2017-04-06 21:19:31 +02001509 Mate
1510 #Dependency
1511 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001512
Akron821db3d2017-04-06 21:19:31 +02001513 MDParser
1514 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001515
Akron821db3d2017-04-06 21:19:31 +02001516 OpenNLP
1517 #Morpho
1518 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001519
Akron821db3d2017-04-06 21:19:31 +02001520 Sgbr
1521 #Lemma
1522 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001523
Akron7d5e6382019-08-08 16:36:27 +02001524 Talismane
1525 #Dependency
1526 #Morpho
1527
Akron821db3d2017-04-06 21:19:31 +02001528 TreeTagger
1529 #Morpho
1530 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001531
Akron821db3d2017-04-06 21:19:31 +02001532 XIP
1533 #Constituency
1534 #Morpho
1535 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001536
Akronc13a1702016-03-15 19:33:14 +01001537
1538More importers are in preparation.
1539New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1540See the built-in annotation importers as examples.
1541
Akronf73ffb62018-06-27 12:13:59 +02001542
Akron8f69d632020-01-15 16:58:11 +01001543=head1 About KorAP-XML
1544
1545KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1546data model (Bański et al. 2013), where text data are stored physically
1547separated from their interpretations (i.e. annotations).
1548A text document in KorAP-XML therefore consists of several files
1549containing primary data, metadata and annotations.
1550
1551The structure of a single KorAP-XML document can be as follows:
1552
1553 - data.xml
1554 - header.xml
1555 + base
1556 - tokens.xml
1557 - ...
1558 + struct
1559 - structure.xml
1560 - ...
1561 + corenlp
1562 - morpho.xml
1563 - constituency.xml
1564 - ...
1565 + tree_tagger
1566 - morpho.xml
1567 - ...
1568 - ...
1569
1570The C<data.xml> contains the primary data, the C<header.xml> contains
1571the metadata, and the annotation layers are stored in subfolders
1572like C<base>, C<struct> or C<corenlp>
1573(so-called "foundries"; Bański et al. 2013).
1574
1575Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001576(Lüngen and Sperberg-McQueen 2012). See the documentation in
1577L<KorAP::XML::Meta::I5> for translatable fields.
1578
1579Annotations correspond to a variant of the TEI-P5 feature structures
1580(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001581Annotation feature structures refer to character sequences of the primary text
1582inside the C<text> element of the C<data.xml>.
1583A single annotation containing the lemma of a token can have the following structure:
1584
1585 <span from="0" to="3">
1586 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1587 <f name="lex">
1588 <fs>
1589 <f name="lemma">zum</f>
1590 </fs>
1591 </f>
1592 </fs>
1593 </span>
1594
The C<from> and C<to> attributes refer to the character span
1596in the primary text.
1597Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1598the structure may vary. See L<KorAP::XML::Annotation::*> for various
1599annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001600
1601Multiple KorAP-XML documents are organized on three levels following
1602the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1603corpus E<gt> document E<gt> text. On each level metadata information
1604can be stored, that C<korapxml2krill> will merge to a single metadata
1605object per text. A corpus is therefore structured as follows:
1606
1607 + <corpus>
1608 - header.xml
1609 + <document>
1610 - header.xml
1611 + <text>
1612 - data.xml
1613 - header.xml
1614 - ...
1615 - ...
1616
1617A single text can be identified by the concatenation of
1618the corpus identifier, the document identifier and the text identifier.
1619This identifier is called the text sigle
1620(e.g. a text with the identifier C<18486> in the document C<060> in the
1621corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1622
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1625(e.g. one zip file per foundry), which is also supported (see C<--input>).
1626
1627Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1628in form of a test suite.
1629The resulting JSON format merges all annotation layers
1630based on a single token stream.
1631
1632=head2 References
1633
1634Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1635KorAP data model: first approximation, December.
1636
1637Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1638"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1639Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1640L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1641
1642Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1643"Robust corpus architecture: a new look at virtual collections and data access",
1644Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1645L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1646
1647Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1648Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1649"Towards an international standard on feature structure representation",
1650Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1651pp. 373-376.
1652L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1653
1654Harald Lüngen and C. M. Sperberg-McQueen (2012):
1655"A TEI P5 Document Grammar for the IDS Text Model",
1656Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1657L<PDF|https://journals.openedition.org/jtei/pdf/508>
1658
1659TEI Consortium, eds:
1660"Feature Structures",
1661Guidelines for Electronic Text Encoding and Interchange.
1662L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1663
Akron941c1a62016-02-23 17:41:41 +01001664=head1 AVAILABILITY
1665
1666 https://github.com/KorAP/KorAP-XML-Krill
1667
1668
1669=head1 COPYRIGHT AND LICENSE
1670
Akron8f69d632020-01-15 16:58:11 +01001671Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001672
Akron8f69d632020-01-15 16:58:11 +01001673Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001674
Akrona76d8352016-10-27 16:27:32 +02001675Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001676
1677L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1678Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001679L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001680member of the
Akronf1849aa2019-12-16 23:35:33 +01001681L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001682
1683This program is free software published under the
1684L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1685
1686=cut