blob: 5e020df799c821ad2b28d0cba7ab2e07a661189a [file] [log] [blame]
Nils Diewald2db9ad02013-10-29 19:26:43 +00001#!/usr/bin/env perl
2use strict;
3use warnings;
Akronf8df2162020-08-07 15:03:39 +02004use v5.10;
Akron941c1a62016-02-23 17:41:41 +01005use FindBin;
6BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
7use File::Spec::Functions qw/catfile catdir/;
8use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald7364d1f2013-11-05 19:26:35 +00009use Benchmark qw/:hireswallclock/;
10use IO::Compress::Gzip qw/$GzipError/;
Akronc11f7982017-02-21 21:20:14 +010011use POSIX qw/ceil/;
Akronb9c33812020-10-21 16:19:35 +020012use Log::Any qw($log);
13use Log::Any::Adapter;
Akron941c1a62016-02-23 17:41:41 +010014use Pod::Usage;
Akron11c80302016-03-18 19:44:43 +010015use Cache::FastMmap;
Akron941c1a62016-02-23 17:41:41 +010016use Directory::Iterator;
Akron41127e32020-08-07 12:46:19 +020017use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron941c1a62016-02-23 17:41:41 +010018use KorAP::XML::Archive;
Akron93d620e2016-02-05 19:40:05 +010019use KorAP::XML::Tokenizer;
Akrone1dbc382016-07-08 22:24:52 +020020use KorAP::XML::Batch::File;
Akron636aa112017-04-07 18:48:56 +020021use Config::Simple;
Akron941c1a62016-02-23 17:41:41 +010022use Parallel::ForkManager;
Akron821db3d2017-04-06 21:19:31 +020023use File::Glob ':bsd_glob';
Akron81500102017-04-07 20:45:44 +020024use File::Temp qw/tempdir/;
Akron63f20d42017-04-10 23:40:29 +020025use File::Path qw(remove_tree make_path);
Akron9a062ce2017-07-04 19:12:05 +020026use File::Basename;
Akron63f20d42017-04-10 23:40:29 +020027use Mojo::Collection 'c';
28use String::Random qw(random_string);
Akron081639e2017-04-21 19:01:39 +020029use IO::File;
30use Archive::Tar::Builder;
Akronda3097e2017-04-23 19:53:57 +020031use Fcntl qw(:flock SEEK_END);
Akronc11f7982017-02-21 21:20:14 +010032
33# use KorAP::XML::ForkPool;
Akron75ba57d2016-03-07 23:36:27 +010034# TODO: use Parallel::Loops
Akron08385f62016-03-22 20:37:04 +010035# TODO: make output files
Akron93d620e2016-02-05 19:40:05 +010036
Akronc11f7982017-02-21 21:20:14 +010037# TODO: Use KorAP::XML::ForkPool!
38
Akron941c1a62016-02-23 17:41:41 +010039# CHANGES:
40# ----------------------------------------------------------
41# 2013/11/25
42# - Initial release
43#
44# 2014/10/29
45# - Merges foundry data to create indexer friendly documents
46#
Akron93d620e2016-02-05 19:40:05 +010047# 2016/02/04
48# - renamed to korapxml2krill
49# - added Schreibgebrauch support
Akron069bd712016-02-12 19:09:06 +010050#
51# 2016/02/12
52# - fixed foundry skipping
Akron941c1a62016-02-23 17:41:41 +010053# - Support overwrite in archive processing
Akron150b29e2016-02-14 23:06:48 +010054#
55# 2016/02/14
56# - Added version information
Akron941c1a62016-02-23 17:41:41 +010057# - Added support for archive files
58#
59# 2016/02/15
60# - Fixed temporary directory bug
61# - Improved skipping before unzipping
62# - Added EXPERIMENTAL concurrency support
63#
64# 2016/02/23
65# - Merge korapxml2krill and korapxml2krill_dir
Akrone10ad322016-02-27 10:54:26 +010066#
67# 2016/02/27
68# - Added extract function
Akron35db6e32016-03-17 22:42:22 +010069#
70# 2016/03/17
71# - Added meta switch
Akron11c80302016-03-18 19:44:43 +010072#
73# 2016/03/18
74# - Added meta data caching
Akron2cfe8092016-06-24 17:48:49 +020075#
Akronf3f0c942016-06-27 13:27:14 +020076# 2016/06/27
Akron2cfe8092016-06-24 17:48:49 +020077# - Added multi archive support
78# - Added prefix negation support
Akronf3f0c942016-06-27 13:27:14 +020079# - Added Malt#Dependency support
Akron8b990522016-07-06 16:45:57 +020080#
81# 2016/07/06
82# - Added MDParser#Dependency
Akron4c0cf312016-10-15 16:42:09 +020083#
84# 2016/10/15
Nils Diewald0e489772016-10-24 15:16:52 +020085# - Fixed temporary path issue in script
86#
87# 2016/10/24
88# - Improved Windows support
Akron4c0cf312016-10-15 16:42:09 +020089#
Akronb4bbec72016-10-26 20:21:02 +020090# 2016/10/24
91# - Added support for document extraction
92#
Akron3741f8b2016-12-21 19:55:21 +010093# 2016/10/27
Akron2fd402b2016-10-27 21:26:48 +020094# - Added wildcard support for document extraction
Akron2812ba22016-10-28 21:55:59 +020095#
Akron3741f8b2016-12-21 19:55:21 +010096# 2016/12/21
97# - added support for base-sentences and base-tokenizations
98#
Akron4fa37c32017-01-20 14:43:10 +010099# 2017/01/20
100# - added support for DRuKoLa annotations
101#
Akron41ac10b2017-02-08 22:47:25 +0100102# 2017/02/08
103# - added support for pagebreak annotations
104#
Akron821db3d2017-04-06 21:19:31 +0200105# 2017/04/06
106# - added support for wildcards in input
107#
Akron636aa112017-04-07 18:48:56 +0200108# 2017/04/07
109# - support configuration option
Akron81500102017-04-07 20:45:44 +0200110# - support for temporary extraction
Akron636aa112017-04-07 18:48:56 +0200111#
Akron9ec88872017-04-12 16:29:06 +0200112# 2017/04/12
Akron63f20d42017-04-10 23:40:29 +0200113# - support serial processing
114# - support input root
Akron9ec88872017-04-12 16:29:06 +0200115# - introduced --sequential-extraction flag
Akronce125b62017-06-19 11:54:36 +0200116#
117# 2017/06/19
118# - added support for DCK
Akron3abc03e2017-06-29 16:23:35 +0200119#
120# 2017/06/29
121# - Fixed exit codes
Akron9a062ce2017-07-04 19:12:05 +0200122#
123# 2017/07/04
124# - Fixed tar building process
Akron4c679192018-01-16 17:41:49 +0100125#
126# 2018/01/16
127# - Added LWC support
Akron5fdc7e12018-07-19 12:37:48 +0200128#
129# 2018/07/19
130# - Preliminary support for HNC.
Akroned9baf02019-01-22 17:03:25 +0100131#
132# 2019/01/22
Akron57510c12019-01-04 14:58:53 +0100133# - Preliminary support for DGD.
Akroned9baf02019-01-22 17:03:25 +0100134# - Support for non-word tokens.
Akron263274c2019-02-07 09:48:30 +0100135#
Akron63d03ee2019-02-13 18:49:38 +0100136# 2019/02/13
Akron263274c2019-02-07 09:48:30 +0100137# - Support for 'koral:field' array.
138# - Support for Koral versioning.
Akron63d03ee2019-02-13 18:49:38 +0100139# - Ignore temporary extract parameter on
140# directory archiving.
Akron7d5e6382019-08-08 16:36:27 +0200141#
142# 2019/08/08
143# - Support for Talismane.
Akronc29b8e12019-12-16 14:28:09 +0100144#
Akronf1849aa2019-12-16 23:35:33 +0100145# 2019/12/17
Akronc29b8e12019-12-16 14:28:09 +0100146# - Added support for DGD pseudo-sentences
147# based on anchor milestones.
Akronf1849aa2019-12-16 23:35:33 +0100148# - Support for non-verbal annotations.
Akron07e24772020-04-23 14:00:54 +0200149#
150# 2020/04/23
151# - Added support for Redewiedergabe-Korpus structure
152# annotations, based on sentence and paragraph milestones
153# - Added support for Redewiedergabe-Korpus morphology
Akron941c1a62016-02-23 17:41:41 +0100154# ----------------------------------------------------------
Akron069bd712016-02-12 19:09:06 +0100155
# Release information: date of the last change, the directory of
# this script and the Koral version used when none is requested.
our $LAST_CHANGE   = '2020/10/21';
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;

# Banner passed as message to every pod2usage call in this script
our $VERSION_MSG = "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";
162
# Parse command: a leading argument that does not start with a dash
# is taken as the command and removed from the argument list
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;

# Keep the remaining arguments (they are passed on in serial mode)
my @keep_argv = @ARGV;

# Values of the repeatable options
my (@skip, @sigle, @anno, @input);

# Configuration hash
my %cfg;
Akrone10ad322016-02-27 10:54:26 +0100175
# Arguments for pod2usage to print the complete help text
my %HELP_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-'
);

# Arguments for pod2usage on usage errors:
# the same output, but with exit status 1
my %ERROR_HASH = (%HELP_HASH, -exit => 1);

# Path of an optional configuration file
my $cfg_file;

# Parse options from the command line
GetOptions(

  # Repeatable options
  'input|i=s'  => \@input,
  'skip|s=s'   => \@skip,
  'sigle|sg=s' => \@sigle,
  'anno|a=s'   => \@anno,

  # Options with a value
  'input-base|ib=s'        => \$cfg{input_base},
  'output|o=s'             => \$cfg{output},
  'meta|m=s'               => \$cfg{meta},
  'token|t=s'              => \$cfg{token},
  'base-sentences|bs=s'    => \$cfg{base_sentences},
  'base-paragraphs|bp=s'   => \$cfg{base_paragraphs},
  'base-pagebreaks|bpb=s'  => \$cfg{base_pagebreaks},
  'temporary-extract|te=s' => \$cfg{extract_dir},
  'cache|c=s'              => \$cfg{cache_file},
  'cache-size|cs=s'        => \$cfg{cache_size},
  'config|cfg=s'           => \$cfg_file,
  'log|l=s'                => \$cfg{log},
  'jobs|j=i'               => \$cfg{jobs},
  'koral|k=f'              => \$cfg{koral},

  # Switches
  'overwrite|w'              => \$cfg{overwrite},
  'gzip|z'                   => \$cfg{gzip},
  'to-tar'                   => \$cfg{to_tar},
  'non-word-tokens|nwt'      => \$cfg{non_word_tokens},
  'non-verbal-tokens|nvt'    => \$cfg{non_verbal_tokens},
  'sequential-extraction|se' => \$cfg{sequential_extraction},
  'cache-delete|cd!'         => \$cfg{cache_delete},
  'cache-init|ci!'           => \$cfg{cache_init},

  # Still accepted, but without any effect besides a warning
  'primary|p!' => sub {
    warn 'Primary flag no longer supported!';
  },
  'pretty|y' => sub {
    warn 'Pretty flag no longer supported!';
  },

  # Print help or version information
  'help|h' => sub {
    pod2usage(%HELP_HASH);
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
Akron63f20d42017-04-10 23:40:29 +0200234
# Load from configuration and fill non-given data.
# A value from the configuration file is only used where no value is
# defined yet, so options given on the command line are never overridden.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar settings: keys in the file are written with dashes,
  # keys in %cfg with underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction cache-init
                      koral extract-dir jobs!) {

    # Transliterate a copy - the non-destructive tr///r requires
    # Perl 5.14, while this script only declares 5.10
    (my $underlined = $key) =~ tr/-/_/;

    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # List settings are written as semicolon separated values and
  # only used, if the list was not given on the command line
  my %list_for = (
    skip  => \@skip,
    sigle => \@sigle,
    anno  => \@anno
  );

  foreach my $key (qw!skip sigle anno!) {
    my $list = $list_for{$key};
    if (!scalar(@$list) && defined $config{$key}) {
      @$list = split /\s*;\s*/, $config{$key};
    };
  };
};
269
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored as 'cache_file', the
# setting from the configuration file as 'cache'. Both have to be
# respected (command line first), otherwise --cache has no effect.
my $cache_file  = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs            = $cfg{jobs} // 0;
my $cache_delete    = $cfg{cache_delete} // 1;
my $base_sentences  = lc($cfg{base_sentences}  // '');
my $base_paragraphs = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis (written as Foundry#Layer).
# The variables are declared unconditionally, as the behaviour of
# "my ... if ..." is undefined in Perl.
my ($token_base_foundry, $token_base_layer) =
  $token_base ? split(/#/, $token_base) : ();

# Remove file extension
# (the layer is undefined, if the basis contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of the (lowercased) values to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron63f20d42017-04-10 23:40:29 +0200296
# Log to STDERR - by default only errors are reported
Log::Any::Adapter->set('Stderr', log_level => uc($cfg{log} // 'ERROR'));

# With a command given, the output has to be an existing directory,
# unless the to-tar option was given
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};

# Input has to be defined
unless (@input) {
  pod2usage(%ERROR_HASH);
};

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald7364d1f2013-11-05 19:26:35 +0000311
Akronc11f7982017-02-21 21:20:14 +0100312
# Start serial processing:
# Every input is converted by a separate run of this script using the
# 'archive' command, each writing to an own output below $output.
# Exits with status 1, if one of the runs failed.
if ($cmd && $cmd eq 'serial') {

  # Output is required - catdir() with an undefined base would
  # otherwise yield a path relative to the root directory on Unix
  pod2usage(%ERROR_HASH) unless $output;

  # Remove all inputs and outputs from the arguments to pass on,
  # as they are set separately for each run
  my $remove_next = 0;
  @keep_argv = grep {
    my $keep = 1;

    # Flag with attached value, e.g. --input=file
    if (/^--?(?:input|i|output|o)=/) {
      $keep = 0;
    }

    # Flag with the value in the following argument
    elsif (/^--?(?:input|i|output|o)$/) {
      $remove_next = 1;
      $keep = 0;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      $keep = 0;
    };

    # Pass parameter
    $keep;
  } @keep_argv;

  # Number of failed runs
  my $failed = 0;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can't create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving - continue with the next input on failure,
    # but remember the failure for the exit status
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed");
      $failed++;
    };
  };

  exit($failed ? 1 : 0);
};
362
# Define supported (and preinstalled) transformation modules.
# Each entry is an array reference of foundry and layer, optionally
# followed by a further parameter. The order of the entries is kept.
my @layers = ();

# Expand a foundry and a list of layers to [foundry, layer] entries
my $layers_of = sub {
  my ($foundry, @names) = @_;
  return map { [$foundry, $_] } @names;
};

# Base sentences and paragraphs are added, if no other base is given
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor, CoreNLP and CMC
push @layers,
  $layers_of->('Connexor', qw/Morpho Syntax Phrase Sentences/),
  $layers_of->('CoreNLP',  qw/NamedEntities Sentences Morpho Constituency/),
  $layers_of->('CMC',      qw/Morpho/);

# DeReKo - collect the bases the structure layer is requested for
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

push(
  @layers,
  (@dereko_attr
     ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
     : ['DeReKo', 'Structure'])
);

# DGD
push @layers, $layers_of->('DGD', qw/Morpho/);
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa, Glemm, HNC, LWC, Malt, MarMoT, Mate, MDParser and OpenNLP
push @layers,
  $layers_of->('DRuKoLa',  qw/Morpho/),
  $layers_of->('Glemm',    qw/Morpho/),
  $layers_of->('HNC',      qw/Morpho/),
  $layers_of->('LWC',      qw/Dependency/),
  $layers_of->('Malt',     qw/Dependency/),
  $layers_of->('MarMoT',   qw/Morpho/),
  $layers_of->('Mate',     qw/Morpho Dependency/),
  $layers_of->('MDParser', qw/Dependency/),
  $layers_of->('OpenNLP',  qw/Morpho Sentences/);

# Redewiedergabe
push @layers, $layers_of->('RWK', qw/Morpho/);
push @layers, ['RWK', 'Structure']
  if $base_sentences eq 'rwk#structure';

# Schreibgebrauch, Talismane, TreeTagger and XIP
push @layers,
  $layers_of->('Sgbr',       qw/Lemma Morpho/),
  $layers_of->('Talismane',  qw/Dependency Morpho/),
  $layers_of->('TreeTagger', qw/Morpho Sentences/),
  $layers_of->('XIP',        qw/Morpho Constituency Sentences Dependency/);
Akrone1dbc382016-07-08 22:24:52 +0200475
Akron4fa37c32017-01-20 14:43:10 +0100476
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations given explicitly
# (each split at '#')
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
495
Akrone1dbc382016-07-08 22:24:52 +0200496
# TODO: This should not be initialized for batch
# Cache stored in $cache_file - size and initialization
# fall back to '50m' and 1, if not configured
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cfg{cache_size} // '50m',
  init_file  => $cfg{cache_init} // 1
);

# Create batch object, that is used for the processing of all texts
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $cfg{meta},
  overwrite         => $cfg{overwrite},
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => $cfg{koral} // $KORAL_VERSION,
  anno              => \@filtered_anno,
  non_word_tokens   => $cfg{non_word_tokens} // 0,
  non_verbal_tokens => $cfg{non_verbal_tokens} // 0
);
518
Akrone512b7c2020-08-07 16:16:12 +0200519
# Auto adjust jobs: -1 means five jobs per core
if ($jobs eq '-1') {

  # Sys::Info is loaded on demand only
  my $sys_info_loaded =
    eval("use Sys::Info; 1;") &&
    eval("use Sys::Info::Constants qw( :device_cpu ); 1;");

  # Assume a single core, if the number can't be determined
  my $cores = 1;
  if ($sys_info_loaded) {
    $cores = Sys::Info->new->device('CPU')->count;
  }
  else {
    $log->warn("Unable to determine number of cores");
  };

  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
533
534
# Glob and prefix files
if (@input > 0) {

  # Prefix each input with the input root (if given)
  # and expand the wildcards
  my @new_input = map {
    bsd_glob($input_base ? catfile($input_base, $_) : $_)
  } @input;

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
554
555
# Process a single file (no command given)
unless ($cmd) {
  my $input = $input[0];

  # Take the start times at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));

    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # Append a slash to the input path, if it does not end with one
  $input =~ s{([^/])$}{$1/}o;

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;
  exit;
};
587
Nils Diewald59094f22014-11-05 18:20:50 +0000588
Akrone10ad322016-02-27 10:54:26 +0100589# Extract XML files
Akron81500102017-04-07 20:45:44 +0200590if ($cmd eq 'extract') {
Akrone10ad322016-02-27 10:54:26 +0100591
Akrond5643ad2017-07-04 20:27:13 +0200592 # Output is required
593 pod2usage(%ERROR_HASH) unless $output;
594
Akron7d4cdd82016-08-17 21:39:45 +0200595 # Create new archive object
Akronb0c88db2016-06-29 16:33:18 +0200596 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone10ad322016-02-27 10:54:26 +0100597
Akron7d4cdd82016-08-17 21:39:45 +0200598 # Check zip capabilities
Akrone10ad322016-02-27 10:54:26 +0100599 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200600 $log->error("Unzip is not installed or incompatible.");
601 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100602 };
603
Akronb0c88db2016-06-29 16:33:18 +0200604 # Add further annotation archives
Akron2812ba22016-10-28 21:55:59 +0200605 $archive->attach($_) foreach @input[1..$#input];
Akronb0c88db2016-06-29 16:33:18 +0200606
Akron31a08cb2019-02-20 20:43:26 +0100607 # Will set @sigle
608 my $prefix = set_sigle($archive);
Akron651cb8d2016-08-16 21:44:49 +0200609
Akrone10ad322016-02-27 10:54:26 +0100610 # Iterate over all given sigles and extract
611 foreach (@sigle) {
Akron60a8caa2017-02-17 21:51:27 +0100612
Akron2812ba22016-10-28 21:55:59 +0200613 print "$_ ...\n";
Akron7d4cdd82016-08-17 21:39:45 +0200614
Akron03b24db2016-08-16 20:54:32 +0200615 # TODO: Make this OS independent
Akron2812ba22016-10-28 21:55:59 +0200616 print '... ' . (
Akron60a8caa2017-02-17 21:51:27 +0100617
Akron955b75b2019-02-21 14:28:41 +0100618 # TODO:
619 # - prefix???
620 $archive->extract_sigle([$_], $output, $jobs)
621 ? '' : 'not '
Akron651cb8d2016-08-16 21:44:49 +0200622 );
Akrone10ad322016-02-27 10:54:26 +0100623 print "extracted.\n";
624 };
Akronb0c88db2016-06-29 16:33:18 +0200625 }
Akron7d4cdd82016-08-17 21:39:45 +0200626
627 # Can't create archive object
Akronb0c88db2016-06-29 16:33:18 +0200628 else {
629 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200630 exit 1;
Akrone10ad322016-02-27 10:54:26 +0100631 };
632}
633
Akron81500102017-04-07 20:45:44 +0200634
Akron941c1a62016-02-23 17:41:41 +0100635# Process an archive
636elsif ($cmd eq 'archive') {
Nils Diewald2db9ad02013-10-29 19:26:43 +0000637
Akron81500102017-04-07 20:45:44 +0200638 my $archive_output;
639
640 # First extract, then archive
Akron63d03ee2019-02-13 18:49:38 +0100641 if (defined $extract_dir && !-d $input[0]) {
Akron81500102017-04-07 20:45:44 +0200642
643 # Create new archive object
644 if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
645
646 # Check zip capabilities
647 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200648 $log->error("Unzip is not installed or incompatible.");
649 exit 1;
Akron81500102017-04-07 20:45:44 +0200650 };
651
652 # Add further annotation archives
653 $archive->attach($_) foreach @input[1..$#input];
654
655 # Create a temporary directory
656 if ($extract_dir eq ':temp:') {
Akron63f20d42017-04-10 23:40:29 +0200657 $extract_dir = tempdir(CLEANUP => 0);
658 print "Temporarily extract to $extract_dir\n";
Akron81500102017-04-07 20:45:44 +0200659 };
660
Akron63f20d42017-04-10 23:40:29 +0200661 # Add some random extra to avoid clashes with multiple archives
662 $extract_dir = catdir($extract_dir, random_string('cccccc'));
663
Akron31a08cb2019-02-20 20:43:26 +0100664 # Extract to temporary directory
Akron9ec88872017-04-12 16:29:06 +0200665 if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron81500102017-04-07 20:45:44 +0200666 @input = ($extract_dir);
667 }
668 else {
669 $log->error('Unable to extract from primary archive ' . $input[0] .
670 ' to ' . $extract_dir);
Akron3abc03e2017-06-29 16:23:35 +0200671 exit 1;
Akron81500102017-04-07 20:45:44 +0200672 };
673 }
674
675 # Can't create archive object
676 else {
677 $log->error('Unable to extract from primary archive ' . $input[0]);
Akron3abc03e2017-06-29 16:23:35 +0200678 exit 1;
Akron81500102017-04-07 20:45:44 +0200679 };
680 };
681
Akron7d4cdd82016-08-17 21:39:45 +0200682 # Zero means: everything runs in the parent process
Akron941c1a62016-02-23 17:41:41 +0100683 my $pool = Parallel::ForkManager->new($jobs);
684
Akron7d4cdd82016-08-17 21:39:45 +0200685 my $count = 0; # Texts to process
Akron941c1a62016-02-23 17:41:41 +0100686 my $iter = 1; # Current text in process
687
Akronda3097e2017-04-23 19:53:57 +0200688 my $tar_archive;
689 my $output_dir = $output;
690 my $tar_fh;
691
692 # Initialize tar archive
693 if ($to_tar) {
694 $tar_archive = Archive::Tar::Builder->new(
695 ignore_errors => 1
696 );
697
698 # Set output name
699 my $tar_file = $output;
700 unless ($tar_file =~ /\.tar$/) {
701 $tar_file .= '.tar';
702 };
703
704 # Initiate the tar file
705 print "Writing to file $tar_file\n";
706 $tar_fh = IO::File->new($tar_file, 'w');
707 $tar_fh->binmode(1);
708
709 # Set handle
710 $tar_archive->set_handle($tar_fh);
711
712 # Output to temporary directory
713 $output_dir = File::Temp->newdir;
714 };
715
Akron941c1a62016-02-23 17:41:41 +0100716 # Report on fork message
717 $pool->run_on_finish (
718 sub {
Akron7d4cdd82016-08-17 21:39:45 +0200719 my ($pid, $code) = @_;
Akron941c1a62016-02-23 17:41:41 +0100720 my $data = pop;
Akron7d4cdd82016-08-17 21:39:45 +0200721
Akron08385f62016-03-22 20:37:04 +0100722 print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron651cb8d2016-08-16 21:44:49 +0200723 ($iter++) . "/$count]" .
724 ($code ? " $code" : '') .
Akron4c0cf312016-10-15 16:42:09 +0200725 ' ' . $data->[0] . "\n";
Akronda3097e2017-04-23 19:53:57 +0200726
727 if (!$code && $to_tar && $data->[2]) {
728 my $filename = $data->[2];
729
730 # Lock filehandle
731 if (flock($tar_fh, LOCK_EX)) {
732
Akron9a062ce2017-07-04 19:12:05 +0200733 my $clean_file = fileparse($filename);
734
Akronda3097e2017-04-23 19:53:57 +0200735 # Archive and remove file
Akron9a062ce2017-07-04 19:12:05 +0200736 $tar_archive->archive_as($filename => $clean_file);
Akronda3097e2017-04-23 19:53:57 +0200737 unlink $filename;
738
739 # Unlock filehandle
740 flock($tar_fh, LOCK_UN);
741 }
742 else {
743 $log->warn("Unable to add $filename to archive");
744 };
745 };
746
Akron4c0cf312016-10-15 16:42:09 +0200747 $data->[1] = undef if $data->[1];
Akron941c1a62016-02-23 17:41:41 +0100748 }
749 );
750
751 my $t;
Akron7d4cdd82016-08-17 21:39:45 +0200752 my $temp;
Akron941c1a62016-02-23 17:41:41 +0100753 print "Reading data ...\n";
754
Akron7d4cdd82016-08-17 21:39:45 +0200755 # unless (Cache::FastMmap->new(
756 # share_file => $cache_file,
757 # cache_size => $cache_size,
758 # init_file => $cache_init
759 # )) {
760 # print "Unable to intialize cache '$cache_file'\n\n";
761 # exit(1);
762 # };
Akron11c80302016-03-18 19:44:43 +0100763
Akron486f9ab2017-04-22 23:25:19 +0200764
Akron941c1a62016-02-23 17:41:41 +0100765 # Input is a directory
Akron08385f62016-03-22 20:37:04 +0100766 if (-d $input[0]) {
Akron5c602cb2020-08-07 17:00:52 +0200767 # TODO:
768 # Replace with Mojo::File
Akron08385f62016-03-22 20:37:04 +0100769 my $it = Directory::Iterator->new($input[0]);
Akron941c1a62016-02-23 17:41:41 +0100770 my @dirs;
771 my $dir;
772
Akron7d4cdd82016-08-17 21:39:45 +0200773 # Todo: Make a DO WHILE
Akron941c1a62016-02-23 17:41:41 +0100774 while (1) {
775 if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron7d4cdd82016-08-17 21:39:45 +0200776 push @dirs, $dir;
777 $it->prune;
Akron941c1a62016-02-23 17:41:41 +0100778 };
779 last unless $it->next;
780 };
781
782 print "Start processing ...\n";
783 $t = Benchmark->new;
784 $count = scalar @dirs;
785
786 DIRECTORY_LOOP:
787 for (my $i = 0; $i < $count; $i++) {
788
Akrone1dbc382016-07-08 22:24:52 +0200789 my $filename = catfile(
Akron081639e2017-04-21 19:01:39 +0200790 $output_dir,
Akron41127e32020-08-07 12:46:19 +0200791 get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akrone1dbc382016-07-08 22:24:52 +0200792 );
Akron941c1a62016-02-23 17:41:41 +0100793
794 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200795 $pool->start and next DIRECTORY_LOOP;
Akron3ec48972016-08-17 23:24:52 +0200796
Akron13d56622016-10-31 14:54:49 +0100797 if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200798 $pool->finish(
799 0,
Akronda3097e2017-04-23 19:53:57 +0200800 [
801 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
802 undef,
803 $filename
804 ]
Akron486f9ab2017-04-22 23:25:19 +0200805 );
Akron3ec48972016-08-17 23:24:52 +0200806 }
807 else {
Akron4c0cf312016-10-15 16:42:09 +0200808 $pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron3ec48972016-08-17 23:24:52 +0200809 };
Akron941c1a62016-02-23 17:41:41 +0100810 };
811 }
812
813 # Input is a file
Akron29866ac2016-06-24 16:40:47 +0200814 elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akrone1dbc382016-07-08 22:24:52 +0200815
Akron941c1a62016-02-23 17:41:41 +0100816 unless ($archive->test_unzip) {
Akron3abc03e2017-06-29 16:23:35 +0200817 $log->error("Unzip is not installed or incompatible.");
818 exit 1;
Akron941c1a62016-02-23 17:41:41 +0100819 };
820
Akron08385f62016-03-22 20:37:04 +0100821 # Add further annotation archived
Akron2812ba22016-10-28 21:55:59 +0200822 $archive->attach($_) foreach @input[1..$#input];
Akron08385f62016-03-22 20:37:04 +0100823
Akron31a08cb2019-02-20 20:43:26 +0100824 # Get sigles to extract
825 my $prefix = set_sigle($archive);
826
Akron941c1a62016-02-23 17:41:41 +0100827 print "Start processing ...\n";
828 $t = Benchmark->new;
829 my @dirs = $archive->list_texts;
830 $count = scalar @dirs;
831
832 ARCHIVE_LOOP:
833 for (my $i = 0; $i < $count; $i++) {
834
835 # Split path information
836 my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
837
Akrone1dbc382016-07-08 22:24:52 +0200838 my $filename = catfile(
Akron486f9ab2017-04-22 23:25:19 +0200839 $output_dir,
Akron7d4cdd82016-08-17 21:39:45 +0200840 get_file_name(
Akron41127e32020-08-07 12:46:19 +0200841 $input[0],
Akron7d4cdd82016-08-17 21:39:45 +0200842 catfile($corpus, $doc, $text)
843 . '.json' . ($gzip ? '.gz' : '')
844 )
Akrone1dbc382016-07-08 22:24:52 +0200845 );
Akron941c1a62016-02-23 17:41:41 +0100846
847 # Get the next fork
Akron7d4cdd82016-08-17 21:39:45 +0200848 $pool->start and next ARCHIVE_LOOP;
Akron941c1a62016-02-23 17:41:41 +0100849
Akron4c0cf312016-10-15 16:42:09 +0200850 # Create temporary file
851 $temp = File::Temp->newdir;
852
Akronbdf434a2016-10-24 17:42:07 +0200853 # TODO: Check if $filename exist at the beginning,
854 # because extraction can be horrible slow!
855
Akron941c1a62016-02-23 17:41:41 +0100856 # Extract from archive
Akron955b75b2019-02-21 14:28:41 +0100857 if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron941c1a62016-02-23 17:41:41 +0100858
Akron7d4cdd82016-08-17 21:39:45 +0200859 # Create corpus directory
860 my $input = catdir("$temp", $corpus);
Akron941c1a62016-02-23 17:41:41 +0100861
Akron7d4cdd82016-08-17 21:39:45 +0200862 # Temporary directory
863 my $dir = catdir($input, $doc, $text);
Akron941c1a62016-02-23 17:41:41 +0100864
Akron7d4cdd82016-08-17 21:39:45 +0200865 # Write file
Akron13d56622016-10-31 14:54:49 +0100866 if (my $return = $batch_file->process($dir => $filename)) {
Akron486f9ab2017-04-22 23:25:19 +0200867
Akron4c0cf312016-10-15 16:42:09 +0200868 # Delete temporary file
Akron13d56622016-10-31 14:54:49 +0100869 $pool->finish(
870 0,
Akronda3097e2017-04-23 19:53:57 +0200871 [
872 "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
873 $temp,
874 $filename
875 ]
Akron13d56622016-10-31 14:54:49 +0100876 );
Akron7d4cdd82016-08-17 21:39:45 +0200877 }
878 else {
Akron4c0cf312016-10-15 16:42:09 +0200879 # Delete temporary file
880 $pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron7d4cdd82016-08-17 21:39:45 +0200881 };
Akron941c1a62016-02-23 17:41:41 +0100882 }
Akron7d4cdd82016-08-17 21:39:45 +0200883
884 # Unable to extract
Akron941c1a62016-02-23 17:41:41 +0100885 else {
Akron4c0cf312016-10-15 16:42:09 +0200886 $pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron941c1a62016-02-23 17:41:41 +0100887 };
888 };
889 }
890
891 else {
892 print "Input is neither a directory nor an archive.\n\n";
893 };
894
895 $pool->wait_all_children;
896
Akron11c80302016-03-18 19:44:43 +0100897 # Delete cache file
898 unlink($cache_file) if $cache_delete;
899
Akronda3097e2017-04-23 19:53:57 +0200900 # Close tar filehandle
901 if ($to_tar && $tar_fh) {
902 $tar_archive->finish;
903 $tar_fh->close;
904 print "Wrote to tar archive.\n";
905 };
906
Akron63f20d42017-04-10 23:40:29 +0200907 print timestr(timediff(Benchmark->new, $t))."\n";
Akron941c1a62016-02-23 17:41:41 +0100908 print "Done.\n";
Akron81500102017-04-07 20:45:44 +0200909};
Akron941c1a62016-02-23 17:41:41 +0100910
Nils Diewald2db9ad02013-10-29 19:26:43 +0000911
# For an archive, establish the list of sigles to process.
#
# Works on the file-level @sigle list:
#   - If @sigle is empty, it is filled with the text sigle
#     (corpus/doc/text) of every text listed by the archive.
#   - Otherwise every doc sigle (corpus/doc) is extracted to $output
#     right away and dropped from @sigle, so that only the text
#     sigles remain.
#
# Parameter:
#   $archive - object providing list_texts, split_path,
#              check_prefix and extract_sigle.
#
# Returns the prefix value: the first element returned by
# split_path() for the last listed text (no sigles given),
# the result of check_prefix() (sigles given), or 1 if the
# archive lists no texts at all.
# NOTE(review): the prefix presumably signals a leading root
# folder in the archive paths - confirm in KorAP::XML::Archive.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given - process every text of the archive
  unless (@sigle) {

    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };

    return $prefix;
  };

  # The prefix is looked up only once, on the first sigle seen,
  # regardless of whether that is a doc sigle or a text sigle
  my $prefix_checked = 0;
  my $check_prefix = sub {
    return if $prefix_checked;

    if ($prefix = $archive->check_prefix) {
      print " with prefix ...";
    };
    $prefix_checked = 1;
  };

  # Text sigles that are kept for later processing
  my @text_sigle;

  # Iterate over all sigles, using a named variable so that
  # calls into the archive can not clobber the current element
  foreach my $current (@sigle) {

    # Sigle is a doc sigle (two path segments, optional "./" in front)
    if ($current =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

      print "$current ...";
      $check_prefix->();
      print "\n";

      # Extract the whole document immediately and report the result
      print '... ' . (
        $archive->extract_sigle([$current], $output, $sequential_extraction ? 1 : $jobs)
          ? '' : 'not '
      );
      print "extracted.\n";
    }

    # Sigle is a text sigle
    else {
      push @text_sigle, $current;
      $check_prefix->();
    };
  };

  @sigle = @text_sigle;

  return $prefix;
};
985
986
# Remove the temporary extraction directory, if one is set
if ($extract_dir) {

  # Delete the tree in "safe" mode and log the count
  # reported by remove_tree()
  my %remove_options = (safe => 1);
  my $objects = remove_tree($extract_dir, \%remove_options);
  $log->info("Removed directory $extract_dir with $objects objects");
};


# Finish the console output with a line break
print "\n";
995
Nils Diewald2db9ad02013-10-29 19:26:43 +0000996__END__
Akron941c1a62016-02-23 17:41:41 +0100997
998=pod
999
1000=encoding utf8
1001
1002=head1 NAME
1003
Akron42f48c12020-02-14 13:08:13 +01001004korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron941c1a62016-02-23 17:41:41 +01001005
1006
1007=head1 SYNOPSIS
1008
Akrona76d8352016-10-27 16:27:32 +02001009 korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron941c1a62016-02-23 17:41:41 +01001010
Akron2fd402b2016-10-27 21:26:48 +02001011
Akron941c1a62016-02-23 17:41:41 +01001012=head1 DESCRIPTION
1013
1014L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
1015compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron8f69d632020-01-15 16:58:11 +01001016The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron941c1a62016-02-23 17:41:41 +01001017
1018
1019=head1 INSTALLATION
1020
1021The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
1022
Akronaf386982016-10-12 00:33:25 +02001023 $ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron941c1a62016-02-23 17:41:41 +01001024
Akronc13a1702016-03-15 19:33:14 +01001025In case everything went well, the C<korapxml2krill> tool will
Akronf7ad89e2016-03-16 18:22:47 +01001026be available on your command line immediately.
Akron6eff23b2018-09-24 10:31:20 +02001027Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron0b04b312020-10-30 17:39:18 +01001028Optional support for L<Sys::Info> to calculate available cores.
In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron941c1a62016-02-23 17:41:41 +01001030
1031=head1 ARGUMENTS
1032
Akrona76d8352016-10-27 16:27:32 +02001033 $ korapxml2krill -z --input <directory> --output <filename>
1034
1035Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron2fd402b2016-10-27 21:26:48 +02001036It expects the input to point to the text level folder.
Akron7606afa2016-10-25 16:23:49 +02001037
Akron941c1a62016-02-23 17:41:41 +01001038=over 2
1039
1040=item B<archive>
1041
Akron081639e2017-04-21 19:01:39 +02001042 $ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akrona76d8352016-10-27 16:27:32 +02001043
Akron2fd402b2016-10-27 21:26:48 +02001044Converts an archive of KorAP-XML documents. It expects a directory
1045(pointing to the corpus level folder) or one or more zip files as input.
Akrone10ad322016-02-27 10:54:26 +01001046
1047=item B<extract>
1048
Akrona76d8352016-10-27 16:27:32 +02001049 $ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
1050
1051Extracts KorAP-XML documents from a zip file.
Akron941c1a62016-02-23 17:41:41 +01001052
Akron63f20d42017-04-10 23:40:29 +02001053=item B<serial>
1054
1055 $ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
1056
1057Convert archives sequentially. The inputs are not merged but treated
1058as they are (so they may be premerged or globs).
The C<--output> directory is treated as the base directory where subdirectories
Akron081639e2017-04-21 19:01:39 +02001060are created based on the archive name. In case the C<--to-tar> flag is given,
1061the output will be a tar file.
Akron63f20d42017-04-10 23:40:29 +02001062
1063
Akron941c1a62016-02-23 17:41:41 +01001064=back
1065
1066
1067=head1 OPTIONS
1068
1069=over 2
1070
Akrona76d8352016-10-27 16:27:32 +02001071=item B<--input|-i> <directory|zip file>
Akron941c1a62016-02-23 17:41:41 +01001072
Akrona76d8352016-10-27 16:27:32 +02001073Directory or zip file(s) of documents to convert.
Akron941c1a62016-02-23 17:41:41 +01001074
Akron7606afa2016-10-25 16:23:49 +02001075Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akronf1a1de92016-11-02 17:32:12 +01001076document, while C<archive> expects a KorAP-XML corpus folder or a zip
1077file to batch process multiple files.
1078C<extract> expects zip files only.
Akron7606afa2016-10-25 16:23:49 +02001079
Akrona76d8352016-10-27 16:27:32 +02001080C<archive> supports multiple input zip files with the constraint,
Akron2cfe8092016-06-24 17:48:49 +02001081that the first archive listed contains all primary data files
1082and all meta data files.
Akron2cfe8092016-06-24 17:48:49 +02001083
Akron7606afa2016-10-25 16:23:49 +02001084 -i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron2cfe8092016-06-24 17:48:49 +02001085
Akron821db3d2017-04-06 21:19:31 +02001086Input may also be defined using BSD glob wildcards.
1087
1088 -i 'file/news*.zip'
1089
1090The extended input array will be sorted in length order, so the shortest
1091path needs to contain all primary data files and all meta data files.
1092
Akron0c3e3752016-06-28 15:55:53 +02001093(The directory structure follows the base directory format,
1094that may include a C<.> root folder.
1095In this case further archives lacking a C<.> root folder
Akron7606afa2016-10-25 16:23:49 +02001096need to be passed with a hash sign in front of the archive's name.
This may require quoting the parameter.)
Akron2cfe8092016-06-24 17:48:49 +02001098
Akron7606afa2016-10-25 16:23:49 +02001099To support zip files, a version of C<unzip> needs to be installed that is
1100compatible with the archive file.
Akrona93d51b2016-10-24 20:27:48 +02001101
Akron7606afa2016-10-25 16:23:49 +02001102B<The root folder switch using the hash sign is experimental and
1103may vanish in future versions.>
Akron651cb8d2016-08-16 21:44:49 +02001104
Akronf73ffb62018-06-27 12:13:59 +02001105
Akron63f20d42017-04-10 23:40:29 +02001106=item B<--input-base|-ib> <directory>
1107
1108The base directory for inputs.
1109
1110
Akron941c1a62016-02-23 17:41:41 +01001111=item B<--output|-o> <directory|file>
1112
1113Output folder for archive processing or
1114document name for single output (optional),
Akronf7ad89e2016-03-16 18:22:47 +01001115writes to C<STDOUT> by default
1116(in case C<output> is not mandatory due to further options).
Akron941c1a62016-02-23 17:41:41 +01001117
1118=item B<--overwrite|-w>
1119
1120Overwrite files that already exist.
1121
Akronf73ffb62018-06-27 12:13:59 +02001122
Akron3741f8b2016-12-21 19:55:21 +01001123=item B<--token|-t> <foundry>#<file>
Akron941c1a62016-02-23 17:41:41 +01001124
1125Define the default tokenization by specifying
1126the name of the foundry and optionally the name
Akronc13a1702016-03-15 19:33:14 +01001127of the layer-file. Defaults to C<OpenNLP#tokens>.
Akronf1849aa2019-12-16 23:35:33 +01001128This will directly take the file instead of running
1129the layer implementation!
Akron941c1a62016-02-23 17:41:41 +01001130
Akron3741f8b2016-12-21 19:55:21 +01001131
1132=item B<--base-sentences|-bs> <foundry>#<layer>
1133
1134Define the layer for base sentences.
1135If given, this will be used instead of using C<Base#Sentences>.
Akronc29b8e12019-12-16 14:28:09 +01001136Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
1137layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001138
1139 Defaults to unset.
1140
1141
1142=item B<--base-paragraphs|-bp> <foundry>#<layer>
1143
1144Define the layer for base paragraphs.
1145If given, this will be used instead of using C<Base#Paragraphs>.
Akron0ffbd522021-02-16 12:01:19 +01001146Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
layers supported.
Akron3741f8b2016-12-21 19:55:21 +01001148
1149 Defaults to unset.
1150
1151
Akron41ac10b2017-02-08 22:47:25 +01001152=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
1153
1154Define the layer for base pagebreaks.
1155Currently C<DeReKo#Structure> is the only layer supported.
1156
1157 Defaults to unset.
1158
1159
Akron941c1a62016-02-23 17:41:41 +01001160=item B<--skip|-s> <foundry>[#<layer>]
1161
Akronf7ad89e2016-03-16 18:22:47 +01001162Skip specific annotations by specifying the foundry
1163(and optionally the layer with a C<#>-prefix),
1164e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron941c1a62016-02-23 17:41:41 +01001165Can be set multiple times.
1166
Akronf73ffb62018-06-27 12:13:59 +02001167
Akronc13a1702016-03-15 19:33:14 +01001168=item B<--anno|-a> <foundry>#<layer>
Akron941c1a62016-02-23 17:41:41 +01001169
Akronf7ad89e2016-03-16 18:22:47 +01001170Convert specific annotations by specifying the foundry
1171(and optionally the layer with a C<#>-prefix),
1172e.g. C<Mate> or C<Mate#Morpho>.
1173Can be set multiple times.
Akron941c1a62016-02-23 17:41:41 +01001174
Akronf73ffb62018-06-27 12:13:59 +02001175
Akroned9baf02019-01-22 17:03:25 +01001176=item B<--non-word-tokens|-nwt>
1177
1178Tokenize non-word tokens like word tokens (defined as matching
1179C</[\d\w]/>). Useful to treat punctuations as tokens.
1180
1181 Defaults to unset.
1182
Akronf1849aa2019-12-16 23:35:33 +01001183
1184=item B<--non-verbal-tokens|-nvt>
1185
Tokenize non-verbal tokens marked in the primary data as
the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
1188
1189 Defaults to unset.
1190
1191
Akron941c1a62016-02-23 17:41:41 +01001192=item B<--jobs|-j>
1193
Define the number of concurrent jobs in separated forks
Akronf7ad89e2016-03-16 18:22:47 +01001195for archive processing.
Akron11c80302016-03-18 19:44:43 +01001196Defaults to C<0> (everything runs in a single process).
Akron9ec88872017-04-12 16:29:06 +02001197
If C<sequential-extraction> is not set to true, this will
also apply to extraction.
1200
Akronc11f7982017-02-21 21:20:14 +01001201Pass -1, and the value will be set automatically to 5
Akron0b04b312020-10-30 17:39:18 +01001202times the number of available cores, in case L<Sys::Info>
1203is available.
Akronf7ad89e2016-03-16 18:22:47 +01001204This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001205
Akronf73ffb62018-06-27 12:13:59 +02001206
Akron263274c2019-02-07 09:48:30 +01001207=item B<--koral|-k>
1208
1209Version of the output format. Supported versions are:
1210C<0> for legacy serialization, C<0.03> for serialization
1211with metadata fields as key-values on the root object,
1212C<0.4> for serialization with metadata fields as a list
1213of C<"@type":"koral:field"> objects.
1214
1215Currently defaults to C<0.03>.
1216
1217
Akron9ec88872017-04-12 16:29:06 +02001218=item B<--sequential-extraction|-se>
1219
Flag to indicate that extraction runs sequentially, i.e. that the C<jobs> value does not apply to extraction.
1221Some systems may have problems with extracting multiple archives
1222to the same folder at the same time.
1223Can be flagged using C<--no-sequential-extraction> as well.
1224Defaults to C<false>.
1225
Akronf73ffb62018-06-27 12:13:59 +02001226
Akron35db6e32016-03-17 22:42:22 +01001227=item B<--meta|-m>
Akron941c1a62016-02-23 17:41:41 +01001228
Akron35db6e32016-03-17 22:42:22 +01001229Define the metadata parser to use. Defaults to C<I5>.
1230Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
1231This is I<experimental>.
Akron941c1a62016-02-23 17:41:41 +01001232
Akronf73ffb62018-06-27 12:13:59 +02001233
Akron941c1a62016-02-23 17:41:41 +01001234=item B<--gzip|-z>
1235
Akronf7ad89e2016-03-16 18:22:47 +01001236Compress the output.
1237Expects a defined C<output> file in single processing.
Akron941c1a62016-02-23 17:41:41 +01001238
Akronf73ffb62018-06-27 12:13:59 +02001239
Akron11c80302016-03-18 19:44:43 +01001240=item B<--cache|-c>
1241
1242File to mmap a cache (using L<Cache::FastMmap>).
1243Defaults to C<korapxml2krill.cache> in the calling directory.
1244
Akronf73ffb62018-06-27 12:13:59 +02001245
Akron11c80302016-03-18 19:44:43 +01001246=item B<--cache-size|-cs>
1247
1248Size of the cache. Defaults to C<50m>.
1249
Akronf73ffb62018-06-27 12:13:59 +02001250
Akron11c80302016-03-18 19:44:43 +01001251=item B<--cache-init|-ci>
1252
1253Initialize cache file.
1254Can be flagged using C<--no-cache-init> as well.
1255Defaults to C<true>.
1256
Akronf73ffb62018-06-27 12:13:59 +02001257
Akron11c80302016-03-18 19:44:43 +01001258=item B<--cache-delete|-cd>
1259
1260Delete cache file after processing.
1261Can be flagged using C<--no-cache-delete> as well.
1262Defaults to C<true>.
1263
Akronf73ffb62018-06-27 12:13:59 +02001264
Akron636aa112017-04-07 18:48:56 +02001265=item B<--config|-cfg>
1266
1267Configure the parameters of your call in a file
1268of key-value pairs with whitespace separator
1269
1270 overwrite 1
1271 token DeReKo#Structure
1272 ...
1273
1274Supported parameters are:
Akron63f20d42017-04-10 23:40:29 +02001275C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron636aa112017-04-07 18:48:56 +02001276C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron31a08cb2019-02-20 20:43:26 +01001277C<output>, C<koral>,
C<temporary-extract>, C<sequential-extraction>,
Akron9ec88872017-04-12 16:29:06 +02001279C<base-sentences>, C<base-paragraphs>,
1280C<base-pagebreaks>,
1281C<skip> (semicolon separated), C<sigle>
Akron636aa112017-04-07 18:48:56 +02001282(semicolon separated), C<anno> (semicolon separated).
1283
Akronf73ffb62018-06-27 12:13:59 +02001284Configuration parameters will always be overwritten by
1285passed parameters.
1286
1287
Akron81500102017-04-07 20:45:44 +02001288=item B<--temporary-extract|-te>
1289
1290Only valid for the C<archive> command.
1291
1292This will first extract all files into a
1293directory and then will archive.
1294If the directory is given as C<:temp:>,
1295a temporary directory is used.
1296This is especially useful to avoid
1297massive unzipping and potential
1298network latency.
Akron636aa112017-04-07 18:48:56 +02001299
Akronf73ffb62018-06-27 12:13:59 +02001300
Akronc93a0802019-07-11 15:48:34 +02001301=item B<--to-tar>
1302
1303Only valid for the C<archive> command.
1304
1305Writes the output into a tar archive.
1306
1307
Akrone10ad322016-02-27 10:54:26 +01001308=item B<--sigle|-sg>
1309
Akron20807582016-10-26 17:11:34 +02001310Extract the given texts.
Akrone10ad322016-02-27 10:54:26 +01001311Can be set multiple times.
Akronf7ad89e2016-03-16 18:22:47 +01001312I<Currently only supported on C<extract>.>
Akronb0c88db2016-06-29 16:33:18 +02001313Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron20807582016-10-26 17:11:34 +02001314In case the C<Text> path is omitted, the whole document will be extracted.
Akron2fd402b2016-10-27 21:26:48 +02001315On the document level, the postfix wildcard C<*> is supported.
Akrone10ad322016-02-27 10:54:26 +01001316
Akronf73ffb62018-06-27 12:13:59 +02001317
Akron941c1a62016-02-23 17:41:41 +01001318=item B<--log|-l>
1319
Akronb9c33812020-10-21 16:19:35 +02001320The L<Log::Any> log level, defaults to C<ERROR>.
Akron941c1a62016-02-23 17:41:41 +01001321
Akronf73ffb62018-06-27 12:13:59 +02001322
Akron941c1a62016-02-23 17:41:41 +01001323=item B<--help|-h>
1324
Akron42f48c12020-02-14 13:08:13 +01001325Print help information.
Akron941c1a62016-02-23 17:41:41 +01001326
Akronf73ffb62018-06-27 12:13:59 +02001327
Akron941c1a62016-02-23 17:41:41 +01001328=item B<--version|-v>
1329
1330Print version information.
1331
1332=back
1333
Akronf73ffb62018-06-27 12:13:59 +02001334
Akronc13a1702016-03-15 19:33:14 +01001335=head1 ANNOTATION SUPPORT
1336
L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
1338developed in the KorAP project that are part of the KorAP preprocessing pipeline.
The base foundry with paragraphs, sentences, and the text element is mandatory for
1340L<Krill|https://github.com/KorAP/Krill>.
1341
Akron821db3d2017-04-06 21:19:31 +02001342 Base
1343 #Paragraphs
1344 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001345
Akron821db3d2017-04-06 21:19:31 +02001346 Connexor
1347 #Morpho
1348 #Phrase
1349 #Sentences
1350 #Syntax
Akronc13a1702016-03-15 19:33:14 +01001351
Akron821db3d2017-04-06 21:19:31 +02001352 CoreNLP
1353 #Constituency
1354 #Morpho
1355 #NamedEntities
1356 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001357
Akronce125b62017-06-19 11:54:36 +02001358 CMC
1359 #Morpho
1360
Akron821db3d2017-04-06 21:19:31 +02001361 DeReKo
1362 #Structure
Akronc13a1702016-03-15 19:33:14 +01001363
Akron57510c12019-01-04 14:58:53 +01001364 DGD
1365 #Morpho
Akronc29b8e12019-12-16 14:28:09 +01001366 #Structure
Akron57510c12019-01-04 14:58:53 +01001367
Akron821db3d2017-04-06 21:19:31 +02001368 DRuKoLa
1369 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001370
Akron821db3d2017-04-06 21:19:31 +02001371 Glemm
1372 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001373
Akronea1aed52018-07-19 14:43:34 +02001374 HNC
1375 #Morpho
1376
Akron4c679192018-01-16 17:41:49 +01001377 LWC
1378 #Dependency
1379
Akron821db3d2017-04-06 21:19:31 +02001380 Malt
1381 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001382
Akron821db3d2017-04-06 21:19:31 +02001383 MarMoT
1384 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001385
Akron821db3d2017-04-06 21:19:31 +02001386 Mate
1387 #Dependency
1388 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001389
Akron821db3d2017-04-06 21:19:31 +02001390 MDParser
1391 #Dependency
Akronc13a1702016-03-15 19:33:14 +01001392
Akron821db3d2017-04-06 21:19:31 +02001393 OpenNLP
1394 #Morpho
1395 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001396
Akron07e24772020-04-23 14:00:54 +02001397 RWK
1398 #Morpho
1399 #Structure
1400
Akron821db3d2017-04-06 21:19:31 +02001401 Sgbr
1402 #Lemma
1403 #Morpho
Akronc13a1702016-03-15 19:33:14 +01001404
Akron7d5e6382019-08-08 16:36:27 +02001405 Talismane
1406 #Dependency
1407 #Morpho
1408
Akron821db3d2017-04-06 21:19:31 +02001409 TreeTagger
1410 #Morpho
1411 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001412
Akron821db3d2017-04-06 21:19:31 +02001413 XIP
1414 #Constituency
1415 #Morpho
1416 #Sentences
Akronc13a1702016-03-15 19:33:14 +01001417
Akronc13a1702016-03-15 19:33:14 +01001418
1419More importers are in preparation.
1420New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
1421See the built-in annotation importers as examples.
1422
Akronf73ffb62018-06-27 12:13:59 +02001423
Akron8f69d632020-01-15 16:58:11 +01001424=head1 About KorAP-XML
1425
1426KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
1427data model (Bański et al. 2013), where text data are stored physically
1428separated from their interpretations (i.e. annotations).
1429A text document in KorAP-XML therefore consists of several files
1430containing primary data, metadata and annotations.
1431
1432The structure of a single KorAP-XML document can be as follows:
1433
1434 - data.xml
1435 - header.xml
1436 + base
1437 - tokens.xml
1438 - ...
1439 + struct
1440 - structure.xml
1441 - ...
1442 + corenlp
1443 - morpho.xml
1444 - constituency.xml
1445 - ...
1446 + tree_tagger
1447 - morpho.xml
1448 - ...
1449 - ...
1450
1451The C<data.xml> contains the primary data, the C<header.xml> contains
1452the metadata, and the annotation layers are stored in subfolders
1453like C<base>, C<struct> or C<corenlp>
1454(so-called "foundries"; Bański et al. 2013).
1455
1456Metadata is available in the TEI-P5 variant I5
Akrond4c5c102020-02-11 11:47:59 +01001457(Lüngen and Sperberg-McQueen 2012). See the documentation in
1458L<KorAP::XML::Meta::I5> for translatable fields.
1459
1460Annotations correspond to a variant of the TEI-P5 feature structures
1461(TEI Consortium; Lee et al. 2004).
Akron72bc5222020-02-06 16:00:13 +01001462Annotation feature structures refer to character sequences of the primary text
1463inside the C<text> element of the C<data.xml>.
1464A single annotation containing the lemma of a token can have the following structure:
1465
1466 <span from="0" to="3">
1467 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
1468 <f name="lex">
1469 <fs>
1470 <f name="lemma">zum</f>
1471 </fs>
1472 </f>
1473 </fs>
1474 </span>
1475
The C<from> and C<to> attributes refer to the character span
1477in the primary text.
1478Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
1479the structure may vary. See L<KorAP::XML::Annotation::*> for various
1480annotation preprocessors.
Akron8f69d632020-01-15 16:58:11 +01001481
1482Multiple KorAP-XML documents are organized on three levels following
1483the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
1484corpus E<gt> document E<gt> text. On each level metadata information
1485can be stored, that C<korapxml2krill> will merge to a single metadata
1486object per text. A corpus is therefore structured as follows:
1487
1488 + <corpus>
1489 - header.xml
1490 + <document>
1491 - header.xml
1492 + <text>
1493 - data.xml
1494 - header.xml
1495 - ...
1496 - ...
1497
1498A single text can be identified by the concatenation of
1499the corpus identifier, the document identifier and the text identifier.
1500This identifier is called the text sigle
1501(e.g. a text with the identifier C<18486> in the document C<060> in the
1502corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
1503
These corpora are often stored in zip files, which C<korapxml2krill>
can deal with. Corpora may also be split into multiple zip archives
1506(e.g. one zip file per foundry), which is also supported (see C<--input>).
1507
1508Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
1509in form of a test suite.
1510The resulting JSON format merges all annotation layers
1511based on a single token stream.
1512
1513=head2 References
1514
1515Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
1516KorAP data model: first approximation, December.
1517
1518Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
1519"The New IDS Corpus Analysis Platform: Challenges and Prospects",
1520Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
1521L<PDF|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
1522
1523Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
1524"Robust corpus architecture: a new look at virtual collections and data access",
1525Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
1526L<PDF|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
1527
1528Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
1529Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
"Towards an international standard on feature structure representation",
1531Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
1532pp. 373-376.
1533L<PDF|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
1534
1535Harald Lüngen and C. M. Sperberg-McQueen (2012):
1536"A TEI P5 Document Grammar for the IDS Text Model",
1537Journal of the Text Encoding Initiative, Issue 3 | November 2012.
1538L<PDF|https://journals.openedition.org/jtei/pdf/508>
1539
1540TEI Consortium, eds:
1541"Feature Structures",
1542Guidelines for Electronic Text Encoding and Interchange.
1543L<html|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
1544
Akron941c1a62016-02-23 17:41:41 +01001545=head1 AVAILABILITY
1546
1547 https://github.com/KorAP/KorAP-XML-Krill
1548
1549
1550=head1 COPYRIGHT AND LICENSE
1551
Akron8f69d632020-01-15 16:58:11 +01001552Copyright (C) 2015-2020, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akronf7ad89e2016-03-16 18:22:47 +01001553
Akron8f69d632020-01-15 16:58:11 +01001554Author: L<Nils Diewald|https://nils-diewald.de/>
Akron81500102017-04-07 20:45:44 +02001555
Akrona76d8352016-10-27 16:27:32 +02001556Contributor: Eliza Margaretha
Akron941c1a62016-02-23 17:41:41 +01001557
1558L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
1559Corpus Analysis Platform at the
Akron94262ce2019-02-28 21:42:43 +01001560L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
Akron941c1a62016-02-23 17:41:41 +01001561member of the
Akronf1849aa2019-12-16 23:35:33 +01001562L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
Akron941c1a62016-02-23 17:41:41 +01001563
1564This program is free software published under the
1565L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
1566
1567=cut