blob: 9b79b47a097592bdbfac03ed7a30962419716226 [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Nils Diewald2db9ad02013-10-29 19:26:43 +000012use Log::Log4perl;
Akron941c1a62016-02-23 17:41:41 +010013use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010014use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010015use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020016use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010017use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010018use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020019use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020020use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010021use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020022use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020023use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020024use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020025use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020026use Mojo::Collection 'c';
27use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020028use IO::File;
29use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020030use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010031
32# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010033# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010034# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010035
Akronc11f7982017-02-21 21:20:14 +010036# TODO: Use KorAP::XML::ForkPool!
37
Akron941c1a62016-02-23 17:41:41 +010038# CHANGES:
39# ----------------------------------------------------------
40# 2013/11/25
41# - Initial release
42#
43# 2014/10/29
44# - Merges foundry data to create indexer friendly documents
45#
Akron93d620e2016-02-05 19:40:05 +010046# 2016/02/04
47# - renamed to korapxml2krill
48# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010049#
50# 2016/02/12
51# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010052# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010053#
54# 2016/02/14
55# - Added version information
Akron941c1a62016-02-23 17:41:41 +010056# - Added support for archive files
57#
58# 2016/02/15
59# - Fixed temporary directory bug
60# - Improved skipping before unzipping
61# - Added EXPERIMENTAL concurrency support
62#
63# 2016/02/23
64# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010065#
66# 2016/02/27
67# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010068#
69# 2016/03/17
70# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010071#
72# 2016/03/18
73# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020074#
Akronf3f0c942016-06-27 13:27:14 +020075# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020076# - Added multi archive support
77# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020078# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020079#
80# 2016/07/06
81# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020082#
83# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020084# - Fixed temporary path issue in script
85#
86# 2016/10/24
87# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020088#
Akronb4bbec72016-10-26 20:21:02 +020089# 2016/10/24
90# - Added support for document extraction
91#
Akron3741f8b2016-12-21 19:55:21 +010092# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020093# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020094#
Akron3741f8b2016-12-21 19:55:21 +010095# 2016/12/21
96# - added support for base-sentences and base-tokenizations
97#
Akron4fa37c32017-01-20 14:43:10 +010098# 2017/01/20
99# - added support for DRuKoLa annotations
100#
Akron41ac10b2017-02-08 22:47:25 +0100101# 2017/02/08
102# - added support for pagebreak annotations
103#
Akron821db3d2017-04-06 21:19:31 +0200104# 2017/04/06
105# - added support for wildcards in input
106#
Akron636aa112017-04-07 18:48:56 +0200107# 2017/04/07
108# - support configuration option
Akron81500102017-04-07 20:45:44 +0200109# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200110#
Akron9ec88872017-04-12 16:29:06 +0200111# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200112# - support serial processing
113# - support input root
Akron9ec88872017-04-12 16:29:06 +0200114# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200115#
116# 2017/06/19
117# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200118#
119# 2017/06/29
120# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200121#
122# 2017/07/04
123# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100124#
125# 2018/01/16
126# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200127#
128# 2018/07/19
129# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100130#
131# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100132# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100133# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100134#
Akron63d03ee2019-02-13 18:49:38 +0100135# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100136# - Support for 'koral:field' array.
137# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100138# - Ignore temporary extract parameter on
139# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200140#
141# 2019/08/08
142# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100143#
Akronf1849aa2019-12-16 23:35:33 +0100144# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100145# - Added support for DGD pseudo-sentences
146# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100147# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200148#
149# 2020/04/23
150# - Added support for Redewiedergabe-Korpus structure
151# annotations, based on sentence and paragraph milestones
152# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100153# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100154
# Date of the last change, shown as part of the version message
our $LAST_CHANGE = '2020/08/07';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Koral version used whenever none is requested explicitly
our $KORAL_VERSION = 0.03;

# Message shown by --version and on top of the usage information
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION

# Parse command:
# A leading argument that does not start with a dash
# is treated as the command instead of an option
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Remember all remaining arguments before they are consumed
# by the option parser, so they can be passed on to child calls
my @keep_argv = @ARGV;

# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg = ();
Akrone10ad322016-02-27 10:54:26 +0100174
# Parse options from the command line.
# Single-valued options end up in %cfg, options that may be
# repeated are collected in their own arrays.
my $cfg_file;

GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \$cfg{input_base},
  'output|o=s'               => \$cfg{output},
  'overwrite|w'              => \$cfg{overwrite},
  'gzip|z'                   => \$cfg{gzip},
  'to-tar'                   => \$cfg{to_tar},
  'temporary-extract|te=s'   => \$cfg{extract_dir},
  'sequential-extraction|se' => \$cfg{sequential_extraction},

  # Conversion
  'meta|m=s'                 => \$cfg{meta},
  'token|t=s'                => \$cfg{token},
  'base-sentences|bs=s'      => \$cfg{base_sentences},
  'base-paragraphs|bp=s'     => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'    => \$cfg{base_pagebreaks},
  'koral|k=f'                => \$cfg{koral},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},

  # Selection of annotations and texts
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,

  # Caching
  'cache|c=s'                => \$cfg{cache_file},
  'cache-size|cs=s'          => \$cfg{cache_size},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Runtime behaviour
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \$cfg{log},
  'jobs|j=i'                 => \$cfg{jobs},

  # Flags that are still accepted but have no effect anymore
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Print the complete usage information
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
225
# Parameters for pod2usage() on invalid invocations:
# Print the usage sections to STDOUT and leave with exit code 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Load from configuration and fill non-given data.
# Values given on the command line always take precedence,
# the configuration file only fills what is still undefined.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction cache-init
                      koral extract-dir jobs!) {

    # Configuration keys use dashes, the keys of %cfg use underscores.
    # The non-destructive tr///r is avoided on purpose, as it requires
    # perl 5.14, while this script only demands perl 5.10.
    (my $underlined = $key) =~ tr/-/_/;

    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # List values are separated by semicolons in the configuration file

  # Skip
  if (!scalar(@skip) && defined $config{'skip'}) {
    @skip = split /\s*;\s*/, $config{'skip'};
  };

  # Sigle
  if (!scalar(@sigle) && defined $config{'sigle'}) {
    @sigle = split /\s*;\s*/, $config{'sigle'};
  };

  # Anno
  if (!scalar(@anno) && defined $config{'anno'}) {
    @anno = split /\s*;\s*/, $config{'anno'};
  };
};
268
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored in 'cache_file', while the
# 'cache' key of a configuration file is stored in 'cache'.
# Both are respected, with the command line taking precedence.
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs            = $cfg{jobs}         // 0;
my $cache_delete    = $cfg{cache_delete} // 1;
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#Layer'.
# The declaration is deliberately unconditional, as the behaviour
# of 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension
# (the layer is undefined in case the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of everything to skip (case insensitive)
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200295
# Initialize log4perl object:
# Everything is logged to STDERR, the level defaults to ERROR
my %log_conf = (
  'log4perl.rootLogger'
    => join(', ', uc($cfg{log} // 'ERROR'), 'STDERR'),
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');

# Commands require the output to be an existing directory,
# unless the --to-tar option was given
if ($cmd && $output && !defined($to_tar)) {
  unless (-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000316
Akronc11f7982017-02-21 21:20:14 +0100317
# Start serial processing:
# Each input is converted by a separate 'archive' call of this
# script, one after the other
if ($cmd && $cmd eq 'serial') {

  # Output is required - otherwise the target directories
  # would be created directly below the file system root
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments passed on,
  # as both are set individually for each call.
  # Only the separated form ('-i value') is recognized.
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the next argument is its value
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    };

    # Input or output value
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;

  # Number of inputs that could not be processed
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command
    # (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - continue with the remaining inputs
    # on failure, but remember that something went wrong
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
367
# Define supported (and preinstalled) transformation modules.
# Each entry is an array reference of a foundry and a layer,
# optionally followed by a third element.
my @layers = ();

# Base annotations are only added as long as no base
# is chosen explicitly
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] }
  qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo - collect all bases that rely on the DeReKo structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] }
  qw/Morpho Constituency Sentences Dependency/;
Akrone1dbc382016-07-08 22:24:52 +0200480
Akron4fa37c32017-01-20 14:43:10 +0100481
# Check filters
my @filtered_anno;

# Everything is skipped by default -
# only the annotations requested explicitly are added
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Add to index file - respect skipping
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Keep, unless Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
500
Akrone1dbc382016-07-08 22:24:52 +0200501
# TODO: This should not be initialized for batch
# Shared cache, by default 50m in size and initialized on start
my $cache = do {
  my $size = $cfg{cache_size} // '50m';
  my $init = $cfg{cache_init} // 1;

  Cache::FastMmap->new(
    share_file => $cache_file,
    cache_size => $size,
    init_file  => $init
  );
};

# Create batch object
my $batch_file = do {

  # Parameters are kept in a list to preserve their order
  my @param = (
    cache     => $cache,
    meta_type => $cfg{meta},
    overwrite => $cfg{overwrite},
    foundry   => $token_base_foundry,
    layer     => $token_base_layer,
    gzip      => $gzip,
    log       => $log,
    koral     => ($cfg{koral} // $KORAL_VERSION),
    anno      => \@filtered_anno
  );

  # Optional token types are disabled by default
  push @param, non_word_tokens   => ($cfg{non_word_tokens}   // 0);
  push @param, non_verbal_tokens => ($cfg{non_verbal_tokens} // 0);

  KorAP::XML::Batch::File->new(@param);
};
523
Akrone512b7c2020-08-07 16:16:12 +0200524
# Auto adjust jobs:
# A value of -1 requests five jobs per available core
if ($jobs eq '-1') {

  # Fallback in case the number of cores can't be determined
  my $cores = 1;

  # Sys::Info is loaded at runtime, as the script is meant
  # to work without it as well
  my $loaded = eval {
    require Sys::Info;
    require Sys::Info::Constants;
    Sys::Info::Constants->import(':device_cpu');
    1;
  };

  # Guard the lookup, so neither an exception nor an undefined
  # or zero count can result in an invalid number of jobs
  my $detected = $loaded
    ? eval { Sys::Info->new->device('CPU')->count }
    : undef;

  if ($detected && $detected > 0) {
    $cores = $detected;
  }
  else {
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
538
539
# Glob and prefix files
if (@input > 0) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    push(@new_input, bsd_glob($pattern));
  };

  # Wildcards may match nothing at all - stop here instead of
  # continuing with an undefined input
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
559
560
# Process a single file
# (this is the default, in case no command is given)
unless ($cmd) {
  my $input = $input[0];

  # Both timers are started at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and
  # since the start, and restart the stop timer
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');

    $main::LAST_STOP = $now;
  };

  # Create and parse new document -
  # ensure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
  exit;
};
592
Nils Diewald59094f22014-11-05 18:20:50 +0000593
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  pod2usage(%ERROR_HASH) unless $output;

  # Create new archive object - the first input is the primary archive
  my $archive;
  if (-f $input[0]) {
    $archive = KorAP::XML::Archive->new($input[0]);
  };

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Attach all further inputs as annotation archives
  foreach my $anno_archive (@input[1..$#input]) {
    $archive->attach($anno_archive);
  };

  # Will set @sigle
  my $prefix = set_sigle($archive);

  # Iterate over all given sigles and extract
  foreach my $sigle (@sigle) {

    print "$sigle ...\n";

    # TODO: Make this OS independent
    # TODO:
    # - prefix???
    my $extracted = $archive->extract_sigle([$sigle], $output, $jobs);

    # Report on the result
    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
638
Akron81500102017-04-07 20:45:44 +0200639
Akron941c1a62016-02-23 17:41:41 +0100640# Process an archive
641elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000642
Akron81500102017-04-07 20:45:44 +0200643 my $archive_output;
644
645 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100646 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200647
648 # Create new archive object
649 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
650
651 # Check zip capabilities
652 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200653 $log->error("Unzip is not installed or incompatible.");
654 exit 1;
Akron81500102017-04-07 20:45:44 +0200655 };
656
657 # Add further annotation archived
658 $archive->attach($_) foreach @input[1..$#input];
659
660 # Create a temporary directory
661 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200662 $extract_dir = tempdir(CLEANUP => 0);
663 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200664 };
665
Akron63f20d42017-04-10 23:40:29 +0200666 # Add some random extra to avoid clashes with multiple archives
667 $extract_dir = catdir($extract_dir, random_string('cccccc'));
668
Akron31a08cb2019-02-20 20:43:26 +0100669 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200670 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200671 @input = ($extract_dir);
672 }
673 else {
674 $log->error('Unable to extract from primary archive ' . $input[0] .
675 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200676 exit 1;
Akron81500102017-04-07 20:45:44 +0200677 };
678 }
679
680 # Can't create archive object
681 else {
682 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200683 exit 1;
Akron81500102017-04-07 20:45:44 +0200684 };
685 };
686
Akron7d4cdd82016-08-17 21:39:45 +0200687 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100688 my $pool = Parallel::ForkManager->new($jobs);
689
Akron7d4cdd82016-08-17 21:39:45 +0200690 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100691 my $iter = 1; # Current text in process
692
Akronda3097e2017-04-23 19:53:57 +0200693 my $tar_archive;
694 my $output_dir = $output;
695 my $tar_fh;
696
# Initialize tar archive
if ($to_tar) {
  $tar_archive = Archive::Tar::Builder->new(
    ignore_errors => 1
  );

  # Set output name - ensure the file has the extension '.tar'
  my $tar_file = $output;
  unless ($tar_file =~ /\.tar$/) {
    $tar_file .= '.tar';
  };

  # Initiate the tar file
  print "Writing to file $tar_file\n";
  $tar_fh = IO::File->new($tar_file, 'w');

  # The constructor returns a false value on failure -
  # stop here instead of calling methods on an undefined handle
  unless ($tar_fh) {
    $log->error("Unable to open tar file '$tar_file' for writing: $!");
    exit 1;
  };

  # Write binary data.
  # binmode() accepts an optional layer name only, so it is
  # called without parameter to request binary mode.
  $tar_fh->binmode;

  # Set handle
  $tar_archive->set_handle($tar_fh);

  # Output to temporary directory
  $output_dir = File::Temp->newdir;
};
720
Akron941c1a62016-02-23 17:41:41 +0100721 # Report on fork message
722 $pool->run_on_finish (
723 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200724 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100725 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200726
Akron08385f62016-03-22 20:37:04 +0100727 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200728 ($iter++) . "/$count]" .
729 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200730 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200731
732 if (!$code && $to_tar && $data->[2]) {
733 my $filename = $data->[2];
734
735 # Lock filehandle
736 if (flock($tar_fh, LOCK_EX)) {
737
Akron9a062ce2017-07-04 19:12:05 +0200738 my $clean_file = fileparse($filename);
739
Akronda3097e2017-04-23 19:53:57 +0200740 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200741 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200742 unlink $filename;
743
744 # Unlock filehandle
745 flock($tar_fh, LOCK_UN);
746 }
747 else {
748 $log->warn("Unable to add $filename to archive");
749 };
750 };
751
Akron4c0cf312016-10-15 16:42:09 +0200752 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100753 }
754 );
755
756 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200757 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100758 print "Reading data ...\n";
759
Akron7d4cdd82016-08-17 21:39:45 +0200760 # unless (Cache::FastMmap->new(
761 # share_file => $cache_file,
762 # cache_size => $cache_size,
763 # init_file => $cache_init
764 # )) {
765 # print "Unable to intialize cache '$cache_file'\n\n";
766 # exit(1);
767 # };
Akron11c80302016-03-18 19:44:43 +0100768
Akron486f9ab2017-04-22 23:25:19 +0200769
Akron941c1a62016-02-23 17:41:41 +0100770 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100771 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200772 # TODO:
773 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100774 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100775 my @dirs;
776 my $dir;
777
Akron7d4cdd82016-08-17 21:39:45 +0200778 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100779 while (1) {
780 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200781 push @dirs, $dir;
782 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100783 };
784 last unless $it->next;
785 };
786
787 print "Start processing ...\n";
788 $t = Benchmark->new;
789 $count = scalar @dirs;
790
791 DIRECTORY_LOOP:
792 for (my $i = 0; $i < $count; $i++) {
793
Akrone1dbc382016-07-08 22:24:52 +0200794 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200795 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200796 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200797 );
Akron941c1a62016-02-23 17:41:41 +0100798
799 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200800 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200801
Akron13d56622016-10-31 14:54:49 +0100802 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200803 $pool->finish(
804 0,
Akronda3097e2017-04-23 19:53:57 +0200805 [
806 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
807 undef,
808 $filename
809 ]
Akron486f9ab2017-04-22 23:25:19 +0200810 );
Akron3ec48972016-08-17 23:24:52 +0200811 }
812 else {
Akron4c0cf312016-10-15 16:42:09 +0200813 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200814 };
Akron941c1a62016-02-23 17:41:41 +0100815 };
816 }
817
818 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200819 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200820
Akron941c1a62016-02-23 17:41:41 +0100821 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200822 $log->error("Unzip is not installed or incompatible.");
823 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100824 };
825
Akron08385f62016-03-22 20:37:04 +0100826 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200827 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100828
Akron31a08cb2019-02-20 20:43:26 +0100829 # Get sigles to extract
830 my $prefix = set_sigle($archive);
831
Akron941c1a62016-02-23 17:41:41 +0100832 print "Start processing ...\n";
833 $t = Benchmark->new;
834 my @dirs = $archive->list_texts;
835 $count = scalar @dirs;
836
837 ARCHIVE_LOOP:
838 for (my $i = 0; $i < $count; $i++) {
839
840 # Split path information
841 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
842
Akrone1dbc382016-07-08 22:24:52 +0200843 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200844 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200845 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200846 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200847 catfile($corpus, $doc, $text)
848 . '.json' . ($gzip ? '.gz' : '')
849 )
Akrone1dbc382016-07-08 22:24:52 +0200850 );
Akron941c1a62016-02-23 17:41:41 +0100851
852 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200853 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100854
Akron4c0cf312016-10-15 16:42:09 +0200855 # Create temporary file
856 $temp = File::Temp->newdir;
857
Akronbdf434a2016-10-24 17:42:07 +0200858 # TODO: Check if $filename exist at the beginning,
859 # because extraction can be horrible slow!
860
Akron941c1a62016-02-23 17:41:41 +0100861 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100862 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100863
Akron7d4cdd82016-08-17 21:39:45 +0200864 # Create corpus directory
865 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100866
Akron7d4cdd82016-08-17 21:39:45 +0200867 # Temporary directory
868 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100869
Akron7d4cdd82016-08-17 21:39:45 +0200870 # Write file
Akron13d56622016-10-31 14:54:49 +0100871 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200872
Akron4c0cf312016-10-15 16:42:09 +0200873 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100874 $pool->finish(
875 0,
Akronda3097e2017-04-23 19:53:57 +0200876 [
877 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
878 $temp,
879 $filename
880 ]
Akron13d56622016-10-31 14:54:49 +0100881 );
Akron7d4cdd82016-08-17 21:39:45 +0200882 }
883 else {
Akron4c0cf312016-10-15 16:42:09 +0200884 # Delete temporary file
885 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200886 };
Akron941c1a62016-02-23 17:41:41 +0100887 }
Akron7d4cdd82016-08-17 21:39:45 +0200888
889 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100890 else {
Akron4c0cf312016-10-15 16:42:09 +0200891 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100892 };
893 };
894 }
895
896 else {
897 print "Input is neither a directory nor an archive.\n\n";
898 };
899
900 $pool->wait_all_children;
901
Akron11c80302016-03-18 19:44:43 +0100902 # Delete cache file
903 unlink($cache_file) if $cache_delete;
904
Akronda3097e2017-04-23 19:53:57 +0200905 # Close tar filehandle
906 if ($to_tar && $tar_fh) {
907 $tar_archive->finish;
908 $tar_fh->close;
909 print "Wrote to tar archive.\n";
910 };
911
Akron63f20d42017-04-10 23:40:29 +0200912 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100913 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200914};
Akron941c1a62016-02-23 17:41:41 +0100915
Nils Diewald2db9ad02013-10-29 19:26:43 +0000916
# For an archive, this will create the list
# of all sigles to process.
#
# Reads and rewrites the file-level @sigle list:
# - Without any given sigle, @sigle is filled with the
#   sigles (corpus/doc/text) of all texts listed by the archive.
# - With given sigles, doc sigles (corpus/doc) are extracted
#   to $output right away and only the remaining sigles
#   are kept in @sigle.
#
# Expects the archive object as the only parameter.
# Returns the prefix value, as reported by the last
# split_path() call or by the single check_prefix() call
# (stays 1, if no sigle is given and the archive lists no texts).
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    for my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # The prefix is checked only once - on the first sigle,
  # no matter if it is a doc sigle or a text sigle
  my $prefix_check = 0;
  my $check_prefix_once = sub {
    return if $prefix_check;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_check = 1;
    return;
  };

  # Sigles that are not extracted here
  my @new_sigle;

  # Iterate over all sigle - a named loop variable is used,
  # so called methods can't clobber the current sigle via $_
  for my $given (@sigle) {

    # Sigle is a text sigle - keep it
    unless ($given =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @new_sigle, $given;
      $check_prefix_once->();
      next;
    };

    # Sigle is a doc sigle - extract it immediately
    print "$given ...";

    # Check if a prefix is needed
    $check_prefix_once->();

    print "\n";

    # The extraction runs before the result line is printed
    print '... ' . (
      $archive->extract_sigle([$given], $output, $sequential_extraction ? 1 : $jobs)
      ? '' : 'not '
    );
    print "extracted.\n";
  };

  @sigle = @new_sigle;

  return $prefix;
};
990
991
# Remove the temporary extraction directory, in case one
# was used, and log the number of removed objects
if ($extract_dir) {
  my $removed = remove_tree($extract_dir, { safe => 1 });
  $log->info("Removed directory $extract_dir with $removed objects");
};

# Finish the console output with a line break
print "\n";
1000
Nils Diewald2db9ad02013-10-29 19:26:43 +00001001__END__
Akron941c1a62016-02-23 17:41:41 +01001002
1003=pod
1004
1005=encoding utf8
1006
1007=head1 NAME
1008
Akron42f48c12020-02-14 13:08:13 +01001009korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001010
1011
1012=head1 SYNOPSIS
1013
Akrona76d8352016-10-27 16:27:32 +02001014 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001015
Akron2fd402b2016-10-27 21:26:48 +02001016
Akron941c1a62016-02-23 17:41:41 +01001017=head1 DESCRIPTION
1018
1019L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1020compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001021The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001022
1023
1024=head1 INSTALLATION
1025
1026The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1027
Akronaf386982016-10-12 00:33:25 +02001028 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001029
Akronc13a1702016-03-15 19:33:14 +01001030In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001031be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001032Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron0b04b312020-10-30 17:39:18 +01001033Optional support for L<Sys::Info> to calculate available cores.
Akrona93d51b2016-10-24 20:27:48 +02001034In addition to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001035
1036=head1 ARGUMENTS
1037
Akrona76d8352016-10-27 16:27:32 +02001038 $ korapxml2krill -z --input <directory> --output <filename>
1039
1040Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001041It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001042
Akron941c1a62016-02-23 17:41:41 +01001043=over 2
1044
1045=item B<archive>
1046
Akron081639e2017-04-21 19:01:39 +02001047 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001048
Akron2fd402b2016-10-27 21:26:48 +02001049Converts an archive of KorAP-XML documents. It expects a directory
1050(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001051
1052=item B<extract>
1053
Akrona76d8352016-10-27 16:27:32 +02001054 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1055
1056Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001057
Akron63f20d42017-04-10 23:40:29 +02001058=item B<serial>
1059
1060 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1061
1062Convert archives sequentially. The inputs are not merged but treated
1063as they are (so they may be premerged or globs).
 1064The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001065are created based on the archive name. In case the C<--to-tar> flag is given,
1066the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001067
1068
Akron941c1a62016-02-23 17:41:41 +01001069=back
1070
1071
1072=head1 OPTIONS
1073
1074=over 2
1075
Akrona76d8352016-10-27 16:27:32 +02001076=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001077
Akrona76d8352016-10-27 16:27:32 +02001078Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001079
Akron7606afa2016-10-25 16:23:49 +02001080Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001081document, while C<archive> expects a KorAP-XML corpus folder or a zip
1082file to batch process multiple files.
1083C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001084
Akrona76d8352016-10-27 16:27:32 +02001085C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001086that the first archive listed contains all primary data files
1087and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001088
Akron7606afa2016-10-25 16:23:49 +02001089 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001090
Akron821db3d2017-04-06 21:19:31 +02001091Input may also be defined using BSD glob wildcards.
1092
1093 -i 'file/news*.zip'
1094
1095The extended input array will be sorted in length order, so the shortest
1096path needs to contain all primary data files and all meta data files.
1097
Akron0c3e3752016-06-28 15:55:53 +02001098(The directory structure follows the base directory format,
1099that may include a C<.> root folder.
1100In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001101need to be passed with a hash sign in front of the archive's name.
 1102This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001103
Akron7606afa2016-10-25 16:23:49 +02001104To support zip files, a version of C<unzip> needs to be installed that is
1105compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001106
Akron7606afa2016-10-25 16:23:49 +02001107B<The root folder switch using the hash sign is experimental and
1108may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001109
Akronf73ffb62018-06-27 12:13:59 +02001110
Akron63f20d42017-04-10 23:40:29 +02001111=item B<--input-base|-ib> <directory>
1112
1113The base directory for inputs.
1114
1115
Akron941c1a62016-02-23 17:41:41 +01001116=item B<--output|-o> <directory|file>
1117
1118Output folder for archive processing or
1119document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001120writes to C<STDOUT> by default
1121(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001122
1123=item B<--overwrite|-w>
1124
1125Overwrite files that already exist.
1126
Akronf73ffb62018-06-27 12:13:59 +02001127
Akron3741f8b2016-12-21 19:55:21 +01001128=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001129
1130Define the default tokenization by specifying
1131the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001132of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001133This will directly take the file instead of running
1134the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001135
Akron3741f8b2016-12-21 19:55:21 +01001136
1137=item B<--base-sentences|-bs> <foundry>#<layer>
1138
1139Define the layer for base sentences.
1140If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001141Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1142layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001143
1144 Defaults to unset.
1145
1146
1147=item B<--base-paragraphs|-bp> <foundry>#<layer>
1148
1149Define the layer for base paragraphs.
1150If given, this will be used instead of using C<Base#Paragraphs>.
1151Currently C<DeReKo#Structure> is the only additional layer supported.
1152
1153 Defaults to unset.
1154
1155
Akron41ac10b2017-02-08 22:47:25 +01001156=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1157
1158Define the layer for base pagebreaks.
1159Currently C<DeReKo#Structure> is the only layer supported.
1160
1161 Defaults to unset.
1162
1163
Akron941c1a62016-02-23 17:41:41 +01001164=item B<--skip|-s> <foundry>[#<layer>]
1165
Akronf7ad89e2016-03-16 18:22:47 +01001166Skip specific annotations by specifying the foundry
1167(and optionally the layer with a C<#>-prefix),
1168e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001169Can be set multiple times.
1170
Akronf73ffb62018-06-27 12:13:59 +02001171
Akronc13a1702016-03-15 19:33:14 +01001172=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001173
Akronf7ad89e2016-03-16 18:22:47 +01001174Convert specific annotations by specifying the foundry
1175(and optionally the layer with a C<#>-prefix),
1176e.g. C<Mate> or C<Mate#Morpho>.
1177Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001178
Akronf73ffb62018-06-27 12:13:59 +02001179
Akroned9baf02019-01-22 17:03:25 +01001180=item B<--non-word-tokens|-nwt>
1181
1182Tokenize non-word tokens like word tokens (defined as matching
1183C</[\d\w]/>). Useful to treat punctuations as tokens.
1184
1185 Defaults to unset.
1186
Akronf1849aa2019-12-16 23:35:33 +01001187
1188=item B<--non-verbal-tokens|-nvt>
1189
 1190Tokenize non-verbal tokens marked in the primary data with
 1191the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1192
1193 Defaults to unset.
1194
1195
Akron941c1a62016-02-23 17:41:41 +01001196=item B<--jobs|-j>
1197
 1198Define the number of concurrent jobs in separate forks
Akronf7ad89e2016-03-16 18:22:47 +01001199for archive processing.
Akron11c80302016-03-18 19:44:43 +01001200Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001201
 1202If C<sequential-extraction> is not set to true, this will
1203also apply to extraction.
1204
Akronc11f7982017-02-21 21:20:14 +01001205Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001206times the number of available cores, in case L<Sys::Info>
1207is available.
Akronf7ad89e2016-03-16 18:22:47 +01001208This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001209
Akronf73ffb62018-06-27 12:13:59 +02001210
Akron263274c2019-02-07 09:48:30 +01001211=item B<--koral|-k>
1212
1213Version of the output format. Supported versions are:
1214C<0> for legacy serialization, C<0.03> for serialization
1215with metadata fields as key-values on the root object,
1216C<0.4> for serialization with metadata fields as a list
1217of C<"@type":"koral:field"> objects.
1218
1219Currently defaults to C<0.03>.
1220
1221
Akron9ec88872017-04-12 16:29:06 +02001222=item B<--sequential-extraction|-se>
1223
1224Flag to indicate, if the C<jobs> value also applies to extraction.
1225Some systems may have problems with extracting multiple archives
1226to the same folder at the same time.
1227Can be flagged using C<--no-sequential-extraction> as well.
1228Defaults to C<false>.
1229
Akronf73ffb62018-06-27 12:13:59 +02001230
Akron35db6e32016-03-17 22:42:22 +01001231=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001232
Akron35db6e32016-03-17 22:42:22 +01001233Define the metadata parser to use. Defaults to C<I5>.
1234Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1235This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001236
Akronf73ffb62018-06-27 12:13:59 +02001237
Akron941c1a62016-02-23 17:41:41 +01001238=item B<--gzip|-z>
1239
Akronf7ad89e2016-03-16 18:22:47 +01001240Compress the output.
1241Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001242
Akronf73ffb62018-06-27 12:13:59 +02001243
Akron11c80302016-03-18 19:44:43 +01001244=item B<--cache|-c>
1245
1246File to mmap a cache (using L<Cache::FastMmap>).
1247Defaults to C<korapxml2krill.cache> in the calling directory.
1248
Akronf73ffb62018-06-27 12:13:59 +02001249
Akron11c80302016-03-18 19:44:43 +01001250=item B<--cache-size|-cs>
1251
1252Size of the cache. Defaults to C<50m>.
1253
Akronf73ffb62018-06-27 12:13:59 +02001254
Akron11c80302016-03-18 19:44:43 +01001255=item B<--cache-init|-ci>
1256
1257Initialize cache file.
1258Can be flagged using C<--no-cache-init> as well.
1259Defaults to C<true>.
1260
Akronf73ffb62018-06-27 12:13:59 +02001261
Akron11c80302016-03-18 19:44:43 +01001262=item B<--cache-delete|-cd>
1263
1264Delete cache file after processing.
1265Can be flagged using C<--no-cache-delete> as well.
1266Defaults to C<true>.
1267
Akronf73ffb62018-06-27 12:13:59 +02001268
Akron636aa112017-04-07 18:48:56 +02001269=item B<--config|-cfg>
1270
1271Configure the parameters of your call in a file
1272of key-value pairs with whitespace separator
1273
1274 overwrite 1
1275 token DeReKo#Structure
1276 ...
1277
1278Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001279C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001280C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001281C<output>, C<koral>,
 1282C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001283C<base-sentences>, C<base-paragraphs>,
1284C<base-pagebreaks>,
1285C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001286(semicolon separated), C<anno> (semicolon separated).
1287
Akronf73ffb62018-06-27 12:13:59 +02001288Configuration parameters will always be overwritten by
1289passed parameters.
1290
1291
Akron81500102017-04-07 20:45:44 +02001292=item B<--temporary-extract|-te>
1293
1294Only valid for the C<archive> command.
1295
1296This will first extract all files into a
1297directory and then will archive.
1298If the directory is given as C<:temp:>,
1299a temporary directory is used.
1300This is especially useful to avoid
1301massive unzipping and potential
1302network latency.
Akron636aa112017-04-07 18:48:56 +02001303
Akronf73ffb62018-06-27 12:13:59 +02001304
Akronc93a0802019-07-11 15:48:34 +02001305=item B<--to-tar>
1306
1307Only valid for the C<archive> command.
1308
1309Writes the output into a tar archive.
1310
1311
Akrone10ad322016-02-27 10:54:26 +01001312=item B<--sigle|-sg>
1313
Akron20807582016-10-26 17:11:34 +02001314Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001315Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001316I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001317Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001318In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001319On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001320
Akronf73ffb62018-06-27 12:13:59 +02001321
Akron941c1a62016-02-23 17:41:41 +01001322=item B<--log|-l>
1323
1324The L<Log4perl> log level, defaults to C<ERROR>.
1325
Akronf73ffb62018-06-27 12:13:59 +02001326
Akron941c1a62016-02-23 17:41:41 +01001327=item B<--help|-h>
1328
Akron42f48c12020-02-14 13:08:13 +01001329Print help information.
Akron941c1a62016-02-23 17:41:41 +01001330
Akronf73ffb62018-06-27 12:13:59 +02001331
Akron941c1a62016-02-23 17:41:41 +01001332=item B<--version|-v>
1333
1334Print version information.
1335
1336=back
1337
Akronf73ffb62018-06-27 12:13:59 +02001338
Akronc13a1702016-03-15 19:33:14 +01001339=head1 ANNOTATION SUPPORT
1340
 1341L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1342developed in the KorAP project that are part of the KorAP preprocessing pipeline.
1343The base foundry with paragraphs, sentences, and the text element are mandatory for
1344L<Krill|https://github.com/KorAP/Krill>.
1345
Akron821db3d2017-04-06 21:19:31 +02001346 Base
1347 #Paragraphs
1348 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001349
Akron821db3d2017-04-06 21:19:31 +02001350 Connexor
1351 #Morpho
1352 #Phrase
1353 #Sentences
1354 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001355
Akron821db3d2017-04-06 21:19:31 +02001356 CoreNLP
1357 #Constituency
1358 #Morpho
1359 #NamedEntities
1360 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001361
Akronce125b62017-06-19 11:54:36 +02001362 CMC
1363 #Morpho
1364
Akron821db3d2017-04-06 21:19:31 +02001365 DeReKo
1366 #Structure
Akronc13a1702016-03-15 19:33:14 +01001367
Akron57510c12019-01-04 14:58:53 +01001368 DGD
1369 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001370 #Structure
Akron57510c12019-01-04 14:58:53 +01001371
Akron821db3d2017-04-06 21:19:31 +02001372 DRuKoLa
1373 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001374
Akron821db3d2017-04-06 21:19:31 +02001375 Glemm
1376 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001377
Akronea1aed52018-07-19 14:43:34 +02001378 HNC
1379 #Morpho
1380
Akron4c679192018-01-16 17:41:49 +01001381 LWC
1382 #Dependency
1383
Akron821db3d2017-04-06 21:19:31 +02001384 Malt
1385 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001386
Akron821db3d2017-04-06 21:19:31 +02001387 MarMoT
1388 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001389
Akron821db3d2017-04-06 21:19:31 +02001390 Mate
1391 #Dependency
1392 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001393
Akron821db3d2017-04-06 21:19:31 +02001394 MDParser
1395 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001396
Akron821db3d2017-04-06 21:19:31 +02001397 OpenNLP
1398 #Morpho
1399 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001400
Akron07e24772020-04-23 14:00:54 +02001401 RWK
1402 #Morpho
1403 #Structure
1404
Akron821db3d2017-04-06 21:19:31 +02001405 Sgbr
1406 #Lemma
1407 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001408
Akron7d5e6382019-08-08 16:36:27 +02001409 Talismane
1410 #Dependency
1411 #Morpho
1412
Akron821db3d2017-04-06 21:19:31 +02001413 TreeTagger
1414 #Morpho
1415 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001416
Akron821db3d2017-04-06 21:19:31 +02001417 XIP
1418 #Constituency
1419 #Morpho
1420 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001421
Akronc13a1702016-03-15 19:33:14 +01001422
1423More importers are in preparation.
1424New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1425See the built-in annotation importers as examples.
1426
Akronf73ffb62018-06-27 12:13:59 +02001427
Akron8f69d632020-01-15 16:58:11 +01001428=head1 About KorAP-XML
1429
1430KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1431data model (Bański et al. 2013), where text data are stored physically
1432separated from their interpretations (i.e. annotations).
1433A text document in KorAP-XML therefore consists of several files
1434containing primary data, metadata and annotations.
1435
1436The structure of a single KorAP-XML document can be as follows:
1437
1438 - data.xml
1439 - header.xml
1440 + base
1441 - tokens.xml
1442 - ...
1443 + struct
1444 - structure.xml
1445 - ...
1446 + corenlp
1447 - morpho.xml
1448 - constituency.xml
1449 - ...
1450 + tree_tagger
1451 - morpho.xml
1452 - ...
1453 - ...
1454
1455The C<data.xml> contains the primary data, the C<header.xml> contains
1456the metadata, and the annotation layers are stored in subfolders
1457like C<base>, C<struct> or C<corenlp>
1458(so-called "foundries"; Bański et al. 2013).
1459
1460Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001461(Lüngen and Sperberg-McQueen 2012). See the documentation in
1462L<KorAP::XML::Meta::I5> for translatable fields.
1463
1464Annotations correspond to a variant of the TEI-P5 feature structures
1465(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001466Annotation feature structures refer to character sequences of the primary text
1467inside the C<text> element of the C<data.xml>.
1468A single annotation containing the lemma of a token can have the following structure:
1469
1470 <span from="0" to="3">
1471 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1472 <f name="lex">
1473 <fs>
1474 <f name="lemma">zum</f>
1475 </fs>
1476 </f>
1477 </fs>
1478 </span>
1479
 1480The C<from> and C<to> attributes refer to the character span
1481in the primary text.
1482Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1483the structure may vary. See L<KorAP::XML::Annotation::*> for various
1484annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001485
1486Multiple KorAP-XML documents are organized on three levels following
1487the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1488corpus E<gt> document E<gt> text. On each level metadata information
1489can be stored, that C<korapxml2krill> will merge to a single metadata
1490object per text. A corpus is therefore structured as follows:
1491
1492 + <corpus>
1493 - header.xml
1494 + <document>
1495 - header.xml
1496 + <text>
1497 - data.xml
1498 - header.xml
1499 - ...
1500 - ...
1501
1502A single text can be identified by the concatenation of
1503the corpus identifier, the document identifier and the text identifier.
1504This identifier is called the text sigle
1505(e.g. a text with the identifier C<18486> in the document C<060> in the
1506corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1507
 1508These corpora are often stored in zip files, which C<korapxml2krill>
1509can deal with. Corpora may also be split in multiple zip archives
1510(e.g. one zip file per foundry), which is also supported (see C<--input>).
1511
1512Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1513in form of a test suite.
1514The resulting JSON format merges all annotation layers
1515based on a single token stream.
1516
1517=head2 References
1518
1519Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1520KorAP data model: first approximation, December.
1521
1522Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1523"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1524Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1525L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1526
1527Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1528"Robust corpus architecture: a new look at virtual collections and data access",
1529Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1530L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1531
1532Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1533Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
 1534"Towards an international standard on feature structure representation",
1535Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1536pp. 373-376.
1537L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1538
1539Harald Lüngen and C. M. Sperberg-McQueen (2012):
1540"A TEI P5 Document Grammar for the IDS Text Model",
1541Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1542L<PDF|https://journals.openedition.org/jtei/pdf/508>
1543
1544TEI Consortium, eds:
1545"Feature Structures",
1546Guidelines for Electronic Text Encoding and Interchange.
1547L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1548
Akron941c1a62016-02-23 17:41:41 +01001549=head1 AVAILABILITY
1550
1551 https://github.com/KorAP/KorAP-XML-Krill
1552
1553
1554=head1 COPYRIGHT AND LICENSE
1555
Akron8f69d632020-01-15 16:58:11 +01001556Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001557
Akron8f69d632020-01-15 16:58:11 +01001558Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001559
Akrona76d8352016-10-27 16:27:32 +02001560Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001561
1562L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1563Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001564L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001565member of the
Akronf1849aa2019-12-16 23:35:33 +01001566L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001567
1568This program is free software published under the
1569L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1570
1571=cut