blob: b42c7931e553bebd4212265ccb461a784678d810 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010013use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010015use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010018use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020019use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020020use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010021use Parallel::ForkManager;
Akronc11f7982017-02-21 21:20:14 +010022use Sys::Info;
23use Sys::Info::Constants qw( :device_cpu );
Akron821db3d2017-04-06 21:19:31 +020024use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020025use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020026use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020027use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020028use Mojo::Collection 'c';
29use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020030use IO::File;
31use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020032use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010033
34# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010035# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010036# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010037
Akronc11f7982017-02-21 21:20:14 +010038# TODO: Use KorAP::XML::ForkPool!
39
Akron941c1a62016-02-23 17:41:41 +010040# CHANGES:
41# ----------------------------------------------------------
42# 2013/11/25
43# - Initial release
44#
45# 2014/10/29
46# - Merges foundry data to create indexer friendly documents
47#
Akron93d620e2016-02-05 19:40:05 +010048# 2016/02/04
49# - renamed to korapxml2krill
50# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010051#
52# 2016/02/12
53# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010054# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010055#
56# 2016/02/14
57# - Added version information
Akron941c1a62016-02-23 17:41:41 +010058# - Added support for archive files
59#
60# 2016/02/15
61# - Fixed temporary directory bug
62# - Improved skipping before unzipping
63# - Added EXPERIMENTAL concurrency support
64#
65# 2016/02/23
66# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010067#
68# 2016/02/27
69# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010070#
71# 2016/03/17
72# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010073#
74# 2016/03/18
75# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020076#
Akronf3f0c942016-06-27 13:27:14 +020077# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020078# - Added multi archive support
79# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020080# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020081#
82# 2016/07/06
83# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020084#
85# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020086# - Fixed temporary path issue in script
87#
88# 2016/10/24
89# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020090#
Akronb4bbec72016-10-26 20:21:02 +020091# 2016/10/24
92# - Added support for document extraction
93#
Akron3741f8b2016-12-21 19:55:21 +010094# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020095# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020096#
Akron3741f8b2016-12-21 19:55:21 +010097# 2016/12/21
98# - added support for base-sentences and base-tokenizations
99#
Akron4fa37c32017-01-20 14:43:10 +0100100# 2017/01/20
101# - added support for DRuKoLa annotations
102#
Akron41ac10b2017-02-08 22:47:25 +0100103# 2017/02/08
104# - added support for pagebreak annotations
105#
Akron821db3d2017-04-06 21:19:31 +0200106# 2017/04/06
107# - added support for wildcards in input
108#
Akron636aa112017-04-07 18:48:56 +0200109# 2017/04/07
110# - support configuration option
Akron81500102017-04-07 20:45:44 +0200111# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200112#
Akron9ec88872017-04-12 16:29:06 +0200113# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200114# - support serial processing
115# - support input root
Akron9ec88872017-04-12 16:29:06 +0200116# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200117#
118# 2017/06/19
119# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200120#
121# 2017/06/29
122# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200123#
124# 2017/07/04
125# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100126#
127# 2018/01/16
128# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200129#
130# 2018/07/19
131# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100132#
133# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100134# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100135# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100136#
Akron63d03ee2019-02-13 18:49:38 +0100137# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100138# - Support for 'koral:field' array.
139# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100140# - Ignore temporary extract parameter on
141# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200142#
143# 2019/08/08
144# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100145#
Akronf1849aa2019-12-16 23:35:33 +0100146# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100147# - Added support for DGD pseudo-sentences
148# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100149# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200150#
151# 2020/04/23
152# - Added support for Redewiedergabe-Korpus structure
153# annotations, based on sentence and paragraph milestones
154# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100155# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100156
# Release metadata shown in the usage and version output
our $LAST_CHANGE   = '2020/08/07';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG   =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command: A leading argument that does not start
# with a dash is taken as the command name
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;

# Keep a copy of the remaining arguments, as they are
# passed on to the archive processes in serial mode
my @keep_argv = @ARGV;
Akron93d620e2016-02-05 19:40:05 +0100171
# Options that can be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash for all single-value options
my %cfg = ();

# Path of the optional configuration file
my $cfg_file;

# Parse options from the command line
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'gzip|z'                   => \$cfg{gzip},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \$cfg{cache_file},
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'anno|a=s'                 => \@anno,

  # Flags that are accepted for compatibility only
  'primary|p!'               => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y'                 => sub {
    warn 'Pretty flag no longer supported!';
  },

  'jobs|j=i'                 => \$cfg{jobs},
  'koral|k=f'                => \$cfg{koral},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Print the full usage information to STDOUT
  'help|h'                   => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message to STDOUT
  'version|v'                => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);

# Parameters for pod2usage in case of invalid
# arguments (usage information, exit code 1)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
Akron63f20d42017-04-10 23:40:29 +0200235
# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value settings: The configuration key uses dashes,
  # the key in the configuration hash uses underscores
  my @scalar_keys = qw!output cache-size input-base token overwrite
                       meta base-sentences base-paragraphs base-pagebreaks
                       gzip to-tar log cache non-word-tokens
                       non-verbal-tokens sequential-extraction cache-init
                       koral extract-dir jobs!;

  foreach my $key (@scalar_keys) {
    next unless defined $config{$key};

    (my $underlined = $key) =~ tr/-/_/;

    # Values given on the command line take precedence
    $cfg{$underlined} //= $config{$key};
  };

  # Multi-value settings (skip, sigle, anno) are separated by
  # semicolons and only respected if not given on the command line
  foreach my $list (['skip', \@skip], ['sigle', \@sigle], ['anno', \@anno]) {
    my ($key, $target) = @$list;

    next if @$target || !defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
270
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option 'cache' is stored as 'cache_file',
# while the configuration file setting is stored as 'cache'.
# Respect both - the command line takes precedence.
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is not combined with a statement modifier,
# as the behaviour of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension (the layer is undefined,
# if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
# (e.g. 'Corpus_Doc.Text' becomes 'Corpus/Doc/Text')
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of the lowercased foundries and layers to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200297
# Initialize log4perl object, logging to STDERR
# on the requested level (defaults to ERROR)
Log::Log4perl->init({
  'log4perl.rootLogger'
    => join(', ', uc($cfg{log} // 'ERROR'), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');

# With a command given, the output has to be an existing
# directory - unless the to-tar option is defined
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined and gzip has
# no effect, if no output is given
pod2usage(%ERROR_HASH) if !@input || ($gzip && !$output);
Nils Diewald7364d1f2013-11-05 19:26:35 +0000318
Akronc11f7982017-02-21 21:20:14 +0100319
# Start serial processing: Every input is processed by a
# separate run of this script using the 'archive' command
if ($cmd && $cmd eq 'serial') {

  # Remove all input and output parameters from the arguments
  # passed on, as they are set individually per archive process.
  # Both the separated form ('-i value') and the joined
  # form ('--input=value') have to be removed.
  my $io_flag = qr/--?(?:input|i|output|o)/;
  my $remove_next = 0;
  my @pass_argv;

  foreach my $arg (@keep_argv) {

    # Flag in separated form - the value follows
    if ($arg =~ /\A$io_flag\z/) {
      $remove_next = 1;
    }

    # Flag and value in joined form
    elsif ($arg =~ /\A$io_flag=/) {
      $remove_next = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @pass_argv, $arg;
    };
  };

  @keep_argv = @pass_argv;

  # Number of archive processes that failed
  my $failed = 0;

  # Iterate over all inputs
  foreach (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($_));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $_, '-o', $new_out);
    print "Start serial processing of $_ to $new_out\n";

    # Start archiving - continue with the remaining
    # inputs on failure, but remember the failure
    if (system(@archive_cmd) != 0) {
      $log->error("Archive processing of $_ failed");
      $failed++;
    };
  };

  # Signal failure, if any of the archive processes failed
  exit($failed ? 1 : 0);
};
369
Akrone512b7c2020-08-07 16:16:12 +0200370
# List of all supported annotation layers in the
# form of [Foundry, Layer] or [Foundry, Layer, Parameter]
my @layers;

# Base - only added, if no other base is requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo - collect all structures that are
# requested to be based on 'dereko#structure'
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

push @layers, (

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho']
);

push @layers, ['RWK', 'Structure'] if $base_sentences eq 'rwk#structure';

push @layers, (

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
468
Akron4fa37c32017-01-20 14:43:10 +0100469
# Check filters and select the annotations to respect
my @filtered_anno;

# All layers are skipped - only add the
# annotations given in the form of 'Foundry#Layer'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, unless Foundry or Foundry#Layer should be skipped
    !$skip{$foundry} && !$skip{$foundry . '#' . $layer};
  } @layers;
};
488
Akrone1dbc382016-07-08 22:24:52 +0200489
# TODO: This should not be initialized for batch
# Cache object, shared via the cache file
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cfg{cache_size} // '50m',
  init_file  => $cfg{cache_init} // 1
);

# Create batch object, passing the cache, the logger,
# the tokenization basis and the filtered annotations
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => $cfg{koral} // $KORAL_VERSION,
  anno              => \@filtered_anno,
  non_word_tokens   => $cfg{non_word_tokens} // 0,
  non_verbal_tokens => $cfg{non_verbal_tokens} // 0
);
511
Akrone512b7c2020-08-07 16:16:12 +0200512
# Auto adjust jobs: A value of -1 requests
# five jobs per reported CPU core
if ($jobs eq '-1') {
  my $cpu   = Sys::Info->new->device('CPU');
  my $cores = $cpu->count;

  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs on $cores cores");
};
519
520
# Glob and prefix files
if (@input > 0) {

  # Prefix all inputs with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # The expansion may yield nothing (e.g. for wildcards without
  # any match) - stop with an explicit error in that case,
  # instead of continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
540
541
# Process a single file, in case no command is given
unless ($cmd) {

  # Take the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since
  # the start of the script and remember the current stop
  sub stop_time {
    my $now = Benchmark->new;
    my $since_last_stop = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last_stop . ' (overall: ' . $overall . ')'
    );

    $main::LAST_STOP = $now;
  };

  # Only the first input is respected and
  # has to end with a slash
  my $input = $input[0];
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
  exit;
};
573
Nils Diewald59094f22014-11-05 18:20:50 +0000574
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object, if the primary input is a file
  my $archive;
  $archive = KorAP::XML::Archive->new($input[0]) if -f($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Add further annotation archives
  foreach my $further (@input[1..$#input]) {
    $archive->attach($further);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    # Report on the result of the extraction
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
619
Akron81500102017-04-07 20:45:44 +0200620
Akron941c1a62016-02-23 17:41:41 +0100621# Process an archive
622elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000623
Akron81500102017-04-07 20:45:44 +0200624 my $archive_output;
625
626 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100627 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200628
629 # Create new archive object
630 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
631
632 # Check zip capabilities
633 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200634 $log->error("Unzip is not installed or incompatible.");
635 exit 1;
Akron81500102017-04-07 20:45:44 +0200636 };
637
638 # Add further annotation archived
639 $archive->attach($_) foreach @input[1..$#input];
640
641 # Create a temporary directory
642 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200643 $extract_dir = tempdir(CLEANUP => 0);
644 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200645 };
646
Akron63f20d42017-04-10 23:40:29 +0200647 # Add some random extra to avoid clashes with multiple archives
648 $extract_dir = catdir($extract_dir, random_string('cccccc'));
649
Akron31a08cb2019-02-20 20:43:26 +0100650 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200651 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200652 @input = ($extract_dir);
653 }
654 else {
655 $log->error('Unable to extract from primary archive ' . $input[0] .
656 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200657 exit 1;
Akron81500102017-04-07 20:45:44 +0200658 };
659 }
660
661 # Can't create archive object
662 else {
663 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200664 exit 1;
Akron81500102017-04-07 20:45:44 +0200665 };
666 };
667
Akron7d4cdd82016-08-17 21:39:45 +0200668 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100669 my $pool = Parallel::ForkManager->new($jobs);
670
Akron7d4cdd82016-08-17 21:39:45 +0200671 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100672 my $iter = 1; # Current text in process
673
Akronda3097e2017-04-23 19:53:57 +0200674 my $tar_archive;
675 my $output_dir = $output;
676 my $tar_fh;
677
678 # Initialize tar archive
679 if ($to_tar) {
680 $tar_archive = Archive::Tar::Builder->new(
681 ignore_errors => 1
682 );
683
684 # Set output name
685 my $tar_file = $output;
686 unless ($tar_file =~ /\.tar$/) {
687 $tar_file .= '.tar';
688 };
689
690 # Initiate the tar file
691 print "Writing to file $tar_file\n";
692 $tar_fh = IO::File->new($tar_file, 'w');
693 $tar_fh->binmode(1);
694
695 # Set handle
696 $tar_archive->set_handle($tar_fh);
697
698 # Output to temporary directory
699 $output_dir = File::Temp->newdir;
700 };
701
Akron941c1a62016-02-23 17:41:41 +0100702 # Report on fork message
703 $pool->run_on_finish (
704 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200705 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100706 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200707
Akron08385f62016-03-22 20:37:04 +0100708 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200709 ($iter++) . "/$count]" .
710 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200711 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200712
713 if (!$code && $to_tar && $data->[2]) {
714 my $filename = $data->[2];
715
716 # Lock filehandle
717 if (flock($tar_fh, LOCK_EX)) {
718
Akron9a062ce2017-07-04 19:12:05 +0200719 my $clean_file = fileparse($filename);
720
Akronda3097e2017-04-23 19:53:57 +0200721 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200722 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200723 unlink $filename;
724
725 # Unlock filehandle
726 flock($tar_fh, LOCK_UN);
727 }
728 else {
729 $log->warn("Unable to add $filename to archive");
730 };
731 };
732
Akron4c0cf312016-10-15 16:42:09 +0200733 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100734 }
735 );
736
737 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200738 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100739 print "Reading data ...\n";
740
Akron7d4cdd82016-08-17 21:39:45 +0200741 # unless (Cache::FastMmap->new(
742 # share_file => $cache_file,
743 # cache_size => $cache_size,
744 # init_file => $cache_init
745 # )) {
746 # print "Unable to intialize cache '$cache_file'\n\n";
747 # exit(1);
748 # };
Akron11c80302016-03-18 19:44:43 +0100749
Akron486f9ab2017-04-22 23:25:19 +0200750
Akron941c1a62016-02-23 17:41:41 +0100751 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100752 if (-d $input[0]) {
753 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100754 my @dirs;
755 my $dir;
756
Akron7d4cdd82016-08-17 21:39:45 +0200757 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100758 while (1) {
759 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200760 push @dirs, $dir;
761 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100762 };
763 last unless $it->next;
764 };
765
766 print "Start processing ...\n";
767 $t = Benchmark->new;
768 $count = scalar @dirs;
769
770 DIRECTORY_LOOP:
771 for (my $i = 0; $i < $count; $i++) {
772
Akrone1dbc382016-07-08 22:24:52 +0200773 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200774 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200775 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200776 );
Akron941c1a62016-02-23 17:41:41 +0100777
778 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200779 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200780
Akron13d56622016-10-31 14:54:49 +0100781 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200782 $pool->finish(
783 0,
Akronda3097e2017-04-23 19:53:57 +0200784 [
785 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
786 undef,
787 $filename
788 ]
Akron486f9ab2017-04-22 23:25:19 +0200789 );
Akron3ec48972016-08-17 23:24:52 +0200790 }
791 else {
Akron4c0cf312016-10-15 16:42:09 +0200792 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200793 };
Akron941c1a62016-02-23 17:41:41 +0100794 };
795 }
796
797 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200798 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200799
Akron941c1a62016-02-23 17:41:41 +0100800 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200801 $log->error("Unzip is not installed or incompatible.");
802 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100803 };
804
Akron08385f62016-03-22 20:37:04 +0100805 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200806 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100807
Akron31a08cb2019-02-20 20:43:26 +0100808 # Get sigles to extract
809 my $prefix = set_sigle($archive);
810
Akron941c1a62016-02-23 17:41:41 +0100811 print "Start processing ...\n";
812 $t = Benchmark->new;
813 my @dirs = $archive->list_texts;
814 $count = scalar @dirs;
815
816 ARCHIVE_LOOP:
817 for (my $i = 0; $i < $count; $i++) {
818
819 # Split path information
820 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
821
Akrone1dbc382016-07-08 22:24:52 +0200822 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200823 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200824 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200825 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200826 catfile($corpus, $doc, $text)
827 . '.json' . ($gzip ? '.gz' : '')
828 )
Akrone1dbc382016-07-08 22:24:52 +0200829 );
Akron941c1a62016-02-23 17:41:41 +0100830
831 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200832 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100833
Akron4c0cf312016-10-15 16:42:09 +0200834 # Create temporary file
835 $temp = File::Temp->newdir;
836
Akronbdf434a2016-10-24 17:42:07 +0200837 # TODO: Check if $filename exist at the beginning,
838 # because extraction can be horrible slow!
839
Akron941c1a62016-02-23 17:41:41 +0100840 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100841 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100842
Akron7d4cdd82016-08-17 21:39:45 +0200843 # Create corpus directory
844 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100845
Akron7d4cdd82016-08-17 21:39:45 +0200846 # Temporary directory
847 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100848
Akron7d4cdd82016-08-17 21:39:45 +0200849 # Write file
Akron13d56622016-10-31 14:54:49 +0100850 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200851
Akron4c0cf312016-10-15 16:42:09 +0200852 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100853 $pool->finish(
854 0,
Akronda3097e2017-04-23 19:53:57 +0200855 [
856 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
857 $temp,
858 $filename
859 ]
Akron13d56622016-10-31 14:54:49 +0100860 );
Akron7d4cdd82016-08-17 21:39:45 +0200861 }
862 else {
Akron4c0cf312016-10-15 16:42:09 +0200863 # Delete temporary file
864 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200865 };
Akron941c1a62016-02-23 17:41:41 +0100866 }
Akron7d4cdd82016-08-17 21:39:45 +0200867
868 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100869 else {
Akron4c0cf312016-10-15 16:42:09 +0200870 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100871 };
872 };
873 }
874
875 else {
876 print "Input is neither a directory nor an archive.\n\n";
877 };
878
879 $pool->wait_all_children;
880
Akron11c80302016-03-18 19:44:43 +0100881 # Delete cache file
882 unlink($cache_file) if $cache_delete;
883
Akronda3097e2017-04-23 19:53:57 +0200884 # Close tar filehandle
885 if ($to_tar && $tar_fh) {
886 $tar_archive->finish;
887 $tar_fh->close;
888 print "Wrote to tar archive.\n";
889 };
890
Akron63f20d42017-04-10 23:40:29 +0200891 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100892 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200893};
Akron941c1a62016-02-23 17:41:41 +0100894
Nils Diewald2db9ad02013-10-29 19:26:43 +0000895
# For an archive, this will create the list
# of all sigles to process.
#
# Works on the file-level @sigle list:
# - If @sigle is empty, it is filled with one text sigle
#   (corpus/doc/text) per text listed by the archive.
# - Otherwise every doc sigle (corpus/doc) is extracted to $output
#   right away, and @sigle is reduced to the remaining text sigles.
#
# Returns the prefix value reported by the archive (via split_path()
# or check_prefix()). It stays at its initial value 1 if no sigles
# were given and the archive lists no texts.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - take all texts of the archive
  unless (@sigle) {

    # A named loop variable keeps the archive methods from
    # touching the global $_
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # Sigles given - separate doc sigles from text sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Ask the archive for a prefix only once,
  # no matter which kind of sigle comes first
  my $check_prefix = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
    return;
  };

  # Iterate over all sigles. The loop variable is a named lexical,
  # so a callee assigning to $_ can not overwrite entries of @sigle.
  foreach my $sigle (@sigle) {

    # Sigle is a doc sigle (two path segments, optionally after "./")
    if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$sigle ...";
      $check_prefix->();
      print "\n";

      # Doc sigles are extracted immediately
      print '... ' . (
        $archive->extract_sigle([$sigle], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle - keep it for later processing
    else {
      push @text_sigle, $sigle;
      $check_prefix->();
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
969
970
# Remove the temporary extraction directory again, if one was used.
# With the "safe" option remove_tree makes no attempt to alter
# file permissions in order to delete an entry.
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info("Removed directory $extract_dir with $removed objects");
};


print "\n";
979
Nils Diewald2db9ad02013-10-29 19:26:43 +0000980__END__
Akron941c1a62016-02-23 17:41:41 +0100981
982=pod
983
984=encoding utf8
985
986=head1 NAME
987
Akron42f48c12020-02-14 13:08:13 +0100988korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +0100989
990
991=head1 SYNOPSIS
992
Akrona76d8352016-10-27 16:27:32 +0200993 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +0100994
Akron2fd402b2016-10-27 21:26:48 +0200995
Akron941c1a62016-02-23 17:41:41 +0100996=head1 DESCRIPTION
997
998L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
999compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001000The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001001
1002
1003=head1 INSTALLATION
1004
1005The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1006
Akronaf386982016-10-12 00:33:25 +02001007 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001008
Akronc13a1702016-03-15 19:33:14 +01001009In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001010be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001011Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001013
1014=head1 ARGUMENTS
1015
Akrona76d8352016-10-27 16:27:32 +02001016 $ korapxml2krill -z --input <directory> --output <filename>
1017
1018Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001019It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001020
Akron941c1a62016-02-23 17:41:41 +01001021=over 2
1022
1023=item B<archive>
1024
Akron081639e2017-04-21 19:01:39 +02001025 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001026
Akron2fd402b2016-10-27 21:26:48 +02001027Converts an archive of KorAP-XML documents. It expects a directory
1028(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001029
1030=item B<extract>
1031
Akrona76d8352016-10-27 16:27:32 +02001032 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1033
1034Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001035
Akron63f20d42017-04-10 23:40:29 +02001036=item B<serial>
1037
1038 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1039
1040Convert archives sequentially. The inputs are not merged but treated
1041as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001043are created based on the archive name. In case the C<--to-tar> flag is given,
1044the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001045
1046
Akron941c1a62016-02-23 17:41:41 +01001047=back
1048
1049
1050=head1 OPTIONS
1051
1052=over 2
1053
Akrona76d8352016-10-27 16:27:32 +02001054=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001055
Akrona76d8352016-10-27 16:27:32 +02001056Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001057
Akron7606afa2016-10-25 16:23:49 +02001058Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001059document, while C<archive> expects a KorAP-XML corpus folder or a zip
1060file to batch process multiple files.
1061C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001062
Akrona76d8352016-10-27 16:27:32 +02001063C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001064that the first archive listed contains all primary data files
1065and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001066
Akron7606afa2016-10-25 16:23:49 +02001067 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001068
Akron821db3d2017-04-06 21:19:31 +02001069Input may also be defined using BSD glob wildcards.
1070
1071 -i 'file/news*.zip'
1072
1073The extended input array will be sorted in length order, so the shortest
1074path needs to contain all primary data files and all meta data files.
1075
Akron0c3e3752016-06-28 15:55:53 +02001076(The directory structure follows the base directory format,
1077that may include a C<.> root folder.
1078In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001079need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001081
Akron7606afa2016-10-25 16:23:49 +02001082To support zip files, a version of C<unzip> needs to be installed that is
1083compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001084
Akron7606afa2016-10-25 16:23:49 +02001085B<The root folder switch using the hash sign is experimental and
1086may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001087
Akronf73ffb62018-06-27 12:13:59 +02001088
Akron63f20d42017-04-10 23:40:29 +02001089=item B<--input-base|-ib> <directory>
1090
1091The base directory for inputs.
1092
1093
Akron941c1a62016-02-23 17:41:41 +01001094=item B<--output|-o> <directory|file>
1095
1096Output folder for archive processing or
1097document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001098writes to C<STDOUT> by default
1099(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001100
1101=item B<--overwrite|-w>
1102
1103Overwrite files that already exist.
1104
Akronf73ffb62018-06-27 12:13:59 +02001105
Akron3741f8b2016-12-21 19:55:21 +01001106=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001107
1108Define the default tokenization by specifying
1109the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001110of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001111This will directly take the file instead of running
1112the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001113
Akron3741f8b2016-12-21 19:55:21 +01001114
1115=item B<--base-sentences|-bs> <foundry>#<layer>
1116
1117Define the layer for base sentences.
1118If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001119Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1120layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001121
1122 Defaults to unset.
1123
1124
1125=item B<--base-paragraphs|-bp> <foundry>#<layer>
1126
1127Define the layer for base paragraphs.
1128If given, this will be used instead of using C<Base#Paragraphs>.
1129Currently C<DeReKo#Structure> is the only additional layer supported.
1130
1131 Defaults to unset.
1132
1133
Akron41ac10b2017-02-08 22:47:25 +01001134=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1135
1136Define the layer for base pagebreaks.
1137Currently C<DeReKo#Structure> is the only layer supported.
1138
1139 Defaults to unset.
1140
1141
Akron941c1a62016-02-23 17:41:41 +01001142=item B<--skip|-s> <foundry>[#<layer>]
1143
Akronf7ad89e2016-03-16 18:22:47 +01001144Skip specific annotations by specifying the foundry
1145(and optionally the layer with a C<#>-prefix),
1146e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001147Can be set multiple times.
1148
Akronf73ffb62018-06-27 12:13:59 +02001149
Akronc13a1702016-03-15 19:33:14 +01001150=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001151
Akronf7ad89e2016-03-16 18:22:47 +01001152Convert specific annotations by specifying the foundry
1153(and optionally the layer with a C<#>-prefix),
1154e.g. C<Mate> or C<Mate#Morpho>.
1155Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001156
Akronf73ffb62018-06-27 12:13:59 +02001157
Akroned9baf02019-01-22 17:03:25 +01001158=item B<--non-word-tokens|-nwt>
1159
1160Tokenize non-word tokens like word tokens (defined as matching
1161C</[\d\w]/>). Useful to treat punctuations as tokens.
1162
1163 Defaults to unset.
1164
Akronf1849aa2019-12-16 23:35:33 +01001165
1166=item B<--non-verbal-tokens|-nvt>
1167
Tokenize non-verbal tokens marked in the primary data with
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1170
1171 Defaults to unset.
1172
1173
Akron941c1a62016-02-23 17:41:41 +01001174=item B<--jobs|-j>
1175
Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001177for archive processing.
Akron11c80302016-03-18 19:44:43 +01001178Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001179
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1182
Akronc11f7982017-02-21 21:20:14 +01001183Pass -1, and the value will be set automatically to 5
1184times the number of available cores.
Akronf7ad89e2016-03-16 18:22:47 +01001185This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001186
Akronf73ffb62018-06-27 12:13:59 +02001187
Akron263274c2019-02-07 09:48:30 +01001188=item B<--koral|-k>
1189
1190Version of the output format. Supported versions are:
1191C<0> for legacy serialization, C<0.03> for serialization
1192with metadata fields as key-values on the root object,
1193C<0.4> for serialization with metadata fields as a list
1194of C<"@type":"koral:field"> objects.
1195
1196Currently defaults to C<0.03>.
1197
1198
Akron9ec88872017-04-12 16:29:06 +02001199=item B<--sequential-extraction|-se>
1200
Flag to indicate that extraction should run sequentially,
i.e. that the C<jobs> value does not apply to extraction.
1202Some systems may have problems with extracting multiple archives
1203to the same folder at the same time.
1204Can be flagged using C<--no-sequential-extraction> as well.
1205Defaults to C<false>.
1206
Akronf73ffb62018-06-27 12:13:59 +02001207
Akron35db6e32016-03-17 22:42:22 +01001208=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001209
Akron35db6e32016-03-17 22:42:22 +01001210Define the metadata parser to use. Defaults to C<I5>.
1211Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1212This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001213
Akronf73ffb62018-06-27 12:13:59 +02001214
Akron941c1a62016-02-23 17:41:41 +01001215=item B<--gzip|-z>
1216
Akronf7ad89e2016-03-16 18:22:47 +01001217Compress the output.
1218Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001219
Akronf73ffb62018-06-27 12:13:59 +02001220
Akron11c80302016-03-18 19:44:43 +01001221=item B<--cache|-c>
1222
1223File to mmap a cache (using L<Cache::FastMmap>).
1224Defaults to C<korapxml2krill.cache> in the calling directory.
1225
Akronf73ffb62018-06-27 12:13:59 +02001226
Akron11c80302016-03-18 19:44:43 +01001227=item B<--cache-size|-cs>
1228
1229Size of the cache. Defaults to C<50m>.
1230
Akronf73ffb62018-06-27 12:13:59 +02001231
Akron11c80302016-03-18 19:44:43 +01001232=item B<--cache-init|-ci>
1233
1234Initialize cache file.
1235Can be flagged using C<--no-cache-init> as well.
1236Defaults to C<true>.
1237
Akronf73ffb62018-06-27 12:13:59 +02001238
Akron11c80302016-03-18 19:44:43 +01001239=item B<--cache-delete|-cd>
1240
1241Delete cache file after processing.
1242Can be flagged using C<--no-cache-delete> as well.
1243Defaults to C<true>.
1244
Akronf73ffb62018-06-27 12:13:59 +02001245
Akron636aa112017-04-07 18:48:56 +02001246=item B<--config|-cfg>
1247
1248Configure the parameters of your call in a file
1249of key-value pairs with whitespace separator
1250
1251 overwrite 1
1252 token DeReKo#Structure
1253 ...
1254
1255Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001256C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001257C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001258C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001260C<base-sentences>, C<base-paragraphs>,
1261C<base-pagebreaks>,
1262C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001263(semicolon separated), C<anno> (semicolon separated).
1264
Akronf73ffb62018-06-27 12:13:59 +02001265Configuration parameters will always be overwritten by
1266passed parameters.
1267
1268
Akron81500102017-04-07 20:45:44 +02001269=item B<--temporary-extract|-te>
1270
1271Only valid for the C<archive> command.
1272
1273This will first extract all files into a
1274directory and then will archive.
1275If the directory is given as C<:temp:>,
1276a temporary directory is used.
1277This is especially useful to avoid
1278massive unzipping and potential
1279network latency.
Akron636aa112017-04-07 18:48:56 +02001280
Akronf73ffb62018-06-27 12:13:59 +02001281
Akronc93a0802019-07-11 15:48:34 +02001282=item B<--to-tar>
1283
1284Only valid for the C<archive> command.
1285
1286Writes the output into a tar archive.
1287
1288
Akrone10ad322016-02-27 10:54:26 +01001289=item B<--sigle|-sg>
1290
Akron20807582016-10-26 17:11:34 +02001291Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001292Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001293I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001294Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001295In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001296On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001297
Akronf73ffb62018-06-27 12:13:59 +02001298
Akron941c1a62016-02-23 17:41:41 +01001299=item B<--log|-l>
1300
1301The L<Log4perl> log level, defaults to C<ERROR>.
1302
Akronf73ffb62018-06-27 12:13:59 +02001303
Akron941c1a62016-02-23 17:41:41 +01001304=item B<--help|-h>
1305
Akron42f48c12020-02-14 13:08:13 +01001306Print help information.
Akron941c1a62016-02-23 17:41:41 +01001307
Akronf73ffb62018-06-27 12:13:59 +02001308
Akron941c1a62016-02-23 17:41:41 +01001309=item B<--version|-v>
1310
1311Print version information.
1312
1313=back
1314
Akronf73ffb62018-06-27 12:13:59 +02001315
Akronc13a1702016-03-15 19:33:14 +01001316=head1 ANNOTATION SUPPORT
1317
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1319developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1320The base foundry with paragraphs, sentences, and the text element are mandatory for
1321L<Krill|https://github.com/KorAP/Krill>.
1322
Akron821db3d2017-04-06 21:19:31 +02001323 Base
1324 #Paragraphs
1325 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001326
Akron821db3d2017-04-06 21:19:31 +02001327 Connexor
1328 #Morpho
1329 #Phrase
1330 #Sentences
1331 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001332
Akron821db3d2017-04-06 21:19:31 +02001333 CoreNLP
1334 #Constituency
1335 #Morpho
1336 #NamedEntities
1337 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001338
Akronce125b62017-06-19 11:54:36 +02001339 CMC
1340 #Morpho
1341
Akron821db3d2017-04-06 21:19:31 +02001342 DeReKo
1343 #Structure
Akronc13a1702016-03-15 19:33:14 +01001344
Akron57510c12019-01-04 14:58:53 +01001345 DGD
1346 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001347 #Structure
Akron57510c12019-01-04 14:58:53 +01001348
Akron821db3d2017-04-06 21:19:31 +02001349 DRuKoLa
1350 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001351
Akron821db3d2017-04-06 21:19:31 +02001352 Glemm
1353 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001354
Akronea1aed52018-07-19 14:43:34 +02001355 HNC
1356 #Morpho
1357
Akron4c679192018-01-16 17:41:49 +01001358 LWC
1359 #Dependency
1360
Akron821db3d2017-04-06 21:19:31 +02001361 Malt
1362 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001363
Akron821db3d2017-04-06 21:19:31 +02001364 MarMoT
1365 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001366
Akron821db3d2017-04-06 21:19:31 +02001367 Mate
1368 #Dependency
1369 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001370
Akron821db3d2017-04-06 21:19:31 +02001371 MDParser
1372 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001373
Akron821db3d2017-04-06 21:19:31 +02001374 OpenNLP
1375 #Morpho
1376 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001377
Akron07e24772020-04-23 14:00:54 +02001378 RWK
1379 #Morpho
1380 #Structure
1381
Akron821db3d2017-04-06 21:19:31 +02001382 Sgbr
1383 #Lemma
1384 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001385
Akron7d5e6382019-08-08 16:36:27 +02001386 Talismane
1387 #Dependency
1388 #Morpho
1389
Akron821db3d2017-04-06 21:19:31 +02001390 TreeTagger
1391 #Morpho
1392 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001393
Akron821db3d2017-04-06 21:19:31 +02001394 XIP
1395 #Constituency
1396 #Morpho
1397 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001398
Akronc13a1702016-03-15 19:33:14 +01001399
1400More importers are in preparation.
1401New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1402See the built-in annotation importers as examples.
1403
Akronf73ffb62018-06-27 12:13:59 +02001404
Akron8f69d632020-01-15 16:58:11 +01001405=head1 About KorAP-XML
1406
1407KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1408data model (Bański et al. 2013), where text data are stored physically
1409separated from their interpretations (i.e. annotations).
1410A text document in KorAP-XML therefore consists of several files
1411containing primary data, metadata and annotations.
1412
1413The structure of a single KorAP-XML document can be as follows:
1414
1415 - data.xml
1416 - header.xml
1417 + base
1418 - tokens.xml
1419 - ...
1420 + struct
1421 - structure.xml
1422 - ...
1423 + corenlp
1424 - morpho.xml
1425 - constituency.xml
1426 - ...
1427 + tree_tagger
1428 - morpho.xml
1429 - ...
1430 - ...
1431
1432The C<data.xml> contains the primary data, the C<header.xml> contains
1433the metadata, and the annotation layers are stored in subfolders
1434like C<base>, C<struct> or C<corenlp>
1435(so-called "foundries"; Bański et al. 2013).
1436
1437Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001438(Lüngen and Sperberg-McQueen 2012). See the documentation in
1439L<KorAP::XML::Meta::I5> for translatable fields.
1440
1441Annotations correspond to a variant of the TEI-P5 feature structures
1442(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001443Annotation feature structures refer to character sequences of the primary text
1444inside the C<text> element of the C<data.xml>.
1445A single annotation containing the lemma of a token can have the following structure:
1446
1447 <span from="0" to="3">
1448 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1449 <f name="lex">
1450 <fs>
1451 <f name="lemma">zum</f>
1452 </fs>
1453 </f>
1454 </fs>
1455 </span>
1456
The C<from> and C<to> attributes refer to the character span
1458in the primary text.
1459Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1460the structure may vary. See L<KorAP::XML::Annotation::*> for various
1461annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001462
1463Multiple KorAP-XML documents are organized on three levels following
1464the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1465corpus E<gt> document E<gt> text. On each level metadata information
1466can be stored, that C<korapxml2krill> will merge to a single metadata
1467object per text. A corpus is therefore structured as follows:
1468
1469 + <corpus>
1470 - header.xml
1471 + <document>
1472 - header.xml
1473 + <text>
1474 - data.xml
1475 - header.xml
1476 - ...
1477 - ...
1478
1479A single text can be identified by the concatenation of
1480the corpus identifier, the document identifier and the text identifier.
1481This identifier is called the text sigle
1482(e.g. a text with the identifier C<18486> in the document C<060> in the
1483corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1484
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1487(e.g. one zip file per foundry), which is also supported (see C<--input>).
1488
1489Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1490in form of a test suite.
1491The resulting JSON format merges all annotation layers
1492based on a single token stream.
1493
1494=head2 References
1495
1496Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1497KorAP data model: first approximation, December.
1498
1499Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1500"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1501Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1502L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1503
1504Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1505"Robust corpus architecture: a new look at virtual collections and data access",
1506Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1507L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1508
1509Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1510Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
1511"Towards an international standard on featurestructure representation",
1512Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1513pp. 373-376.
1514L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1515
1516Harald Lüngen and C. M. Sperberg-McQueen (2012):
1517"A TEI P5 Document Grammar for the IDS Text Model",
1518Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1519L<PDF|https://journals.openedition.org/jtei/pdf/508>
1520
1521TEI Consortium, eds:
1522"Feature Structures",
1523Guidelines for Electronic Text Encoding and Interchange.
1524L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1525
Akron941c1a62016-02-23 17:41:41 +01001526=head1 AVAILABILITY
1527
1528 https://github.com/KorAP/KorAP-XML-Krill
1529
1530
1531=head1 COPYRIGHT AND LICENSE
1532
Akron8f69d632020-01-15 16:58:11 +01001533Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001534
Akron8f69d632020-01-15 16:58:11 +01001535Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001536
Akrona76d8352016-10-27 16:27:32 +02001537Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001538
1539L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1540Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001541L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001542member of the
Akronf1849aa2019-12-16 23:35:33 +01001543L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001544
1545This program is free software published under the
1546L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1547
1548=cut