Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: cb1863c41374d89e25f40e4ea812ae0b545938d8 [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	4	use v5.10;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	5	use FindBin;
				6	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				7	use File::Spec::Functions qw/catfile catdir/;
				8	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	9	use Benchmark qw/:hireswallclock/;
				10	use IO::Compress::Gzip qw/$GzipError/;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	11	use POSIX qw/ceil/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	12	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	13	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	14	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	15	use Directory::Iterator;
Akron	41127e3	2020-08-07 12:46:19 +0200	[diff] [blame]	16	use KorAP::XML::Krill qw!get_file_name get_file_name_from_glob!;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	17	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	18	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	19	use KorAP::XML::Batch::File;
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	20	use Config::Simple;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	21	use Parallel::ForkManager;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	22	use Sys::Info;
				23	use Sys::Info::Constants qw( :device_cpu );
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	24	use File::Glob ':bsd_glob';
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	25	use File::Temp qw/tempdir/;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	26	use File::Path qw(remove_tree make_path);
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	27	use File::Basename;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	28	use Mojo::Collection 'c';
				29	use String::Random qw(random_string);
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	30	use IO::File;
				31	use Archive::Tar::Builder;
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	32	use Fcntl qw(:flock SEEK_END);
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	33
				34	# use KorAP::XML::ForkPool;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	35	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	36	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	37
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	38	# TODO: Use KorAP::XML::ForkPool!
				39
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	40	# CHANGES:
				41	# ----------------------------------------------------------
				42	# 2013/11/25
				43	# - Initial release
				44	#
				45	# 2014/10/29
				46	# - Merges foundry data to create indexer friendly documents
				47	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	48	# 2016/02/04
				49	# - renamed to korapxml2krill
				50	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	51	#
				52	# 2016/02/12
				53	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	54	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	55	#
				56	# 2016/02/14
				57	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	58	# - Added support for archive files
				59	#
				60	# 2016/02/15
				61	# - Fixed temporary directory bug
				62	# - Improved skipping before unzipping
				63	# - Added EXPERIMENTAL concurrency support
				64	#
				65	# 2016/02/23
				66	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	67	#
				68	# 2016/02/27
				69	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	70	#
				71	# 2016/03/17
				72	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	73	#
				74	# 2016/03/18
				75	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	76	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	77	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	78	# - Added multi archive support
				79	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	80	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	81	#
				82	# 2016/07/06
				83	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	84	#
				85	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	86	# - Fixed temporary path issue in script
				87	#
				88	# 2016/10/24
				89	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	90	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	91	# 2016/10/24
				92	# - Added support for document extraction
				93	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	94	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	95	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	96	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	97	# 2016/12/21
				98	# - added support for base-sentences and base-tokenizations
				99	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	100	# 2017/01/20
				101	# - added support for DRuKoLa annotations
				102	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	103	# 2017/02/08
				104	# - added support for pagebreak annotations
				105	#
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	106	# 2017/04/06
				107	# - added support for wildcards in input
				108	#
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	109	# 2017/04/07
				110	# - support configuration option
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	111	# - support for temporary extraction
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	112	#
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	113	# 2017/04/12
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	114	# - support serial processing
				115	# - support input root
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	116	# - introduced --sequential-extraction flag
Akron	ce125b6	2017-06-19 11:54:36 +0200	[diff] [blame]	117	#
				118	# 2017/06/19
				119	# - added support for DCK
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	120	#
				121	# 2017/06/29
				122	# - Fixed exit codes
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	123	#
				124	# 2017/07/04
				125	# - Fixed tar building process
Akron	4c67919	2018-01-16 17:41:49 +0100	[diff] [blame]	126	#
				127	# 2018/01/16
				128	# - Added LWC support
Akron	5fdc7e1	2018-07-19 12:37:48 +0200	[diff] [blame]	129	#
				130	# 2018/07/19
				131	# - Preliminary support for HNC.
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	132	#
				133	# 2019/01/22
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	134	# - Preliminary support for DGD.
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	135	# - Support for non-word tokens.
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	136	#
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	137	# 2019/02/13
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	138	# - Support for 'koral:field' array.
				139	# - Support for Koral versioning.
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	140	# - Ignore temporary extract parameter on
				141	# directory archiving.
Akron	7d5e638	2019-08-08 16:36:27 +0200	[diff] [blame]	142	#
				143	# 2019/08/08
				144	# - Support for Talismane.
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	145	#
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	146	# 2019/12/17
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	147	# - Added support for DGD pseudo-sentences
				148	# based on anchor milestones.
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	149	# - Support for non-verbal annotations.
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame]	150	#
				151	# 2020/04/23
				152	# - Added support for Redewiedergabe-Korpus structure
				153	# annotations, based on sentence and paragraph milestones
				154	# - Added support for Redewiedergabe-Korpus morphology
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	155	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	156
# Date of the most recent change to this script
# (shown in the version banner)
our $LAST_CHANGE = '2020/08/07';

# Directory this script is located in
our $LOCAL = $FindBin::Bin;

# Koral version used when no --koral option is given
our $KORAL_VERSION = 0.03;

# Banner printed in front of the help and version output
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
				163
# Parse command:
# A leading argument that does not start with a dash is taken
# as the command and removed from the argument list.
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && substr($ARGV[0], 0, 1) ne '-';

# Keep a copy of the remaining arguments before option parsing,
# so they can be handed over to child processes in serial mode
my @keep_argv = @ARGV;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	171
# Options that may be given multiple times
my (@skip, @sigle, @anno, @input);

# Configuration hash:
# Collects all single-value options, keyed by the option
# name with dashes replaced by underscores
my %cfg = ();

# Options that are not stored in the configuration hash
my ($cfg_file, $primary, $pretty);

# Parse options from the command line.
# Each specification is 'long-name|alias', optionally followed by a
# type ('=s' string, '=i' integer, '=f' float) or '!' (negatable flag).
GetOptions(

  # Input and output
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \($cfg{input_base}),
  'output|o=s'               => \($cfg{output}),
  'overwrite|w'              => \($cfg{overwrite}),
  'gzip|z'                   => \($cfg{gzip}),
  'to-tar'                   => \($cfg{to_tar}),
  'temporary-extract|te=s'   => \($cfg{extract_dir}),
  'sequential-extraction|se' => \($cfg{sequential_extraction}),

  # Document structure and annotations
  'meta|m=s'                 => \($cfg{meta}),
  'token|t=s'                => \($cfg{token}),
  'base-sentences|bs=s'      => \($cfg{base_sentences}),
  'base-paragraphs|bp=s'     => \($cfg{base_paragraphs}),
  'base-pagebreaks|bpb=s'    => \($cfg{base_pagebreaks}),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'anno|a=s'                 => \@anno,
  'non-word-tokens|nwt'      => \($cfg{non_word_tokens}),
  'non-verbal-tokens|nvt'    => \($cfg{non_verbal_tokens}),

  # Serialization
  'primary|p!'               => \$primary,
  'pretty|y'                 => \$pretty,
  'koral|k=f'                => \($cfg{koral}),

  # Caching
  'cache|c=s'                => \($cfg{cache_file}),
  'cache-size|cs=s'          => \($cfg{cache_size}),
  'cache-delete|cd!'         => \($cfg{cache_delete}),
  'cache-init|ci!'           => \($cfg{cache_init}),

  # Runtime
  'config|cfg=s'             => \$cfg_file,
  'log|l=s'                  => \($cfg{log}),
  'jobs|j=i'                 => \($cfg{jobs}),

  # Print the selected POD sections preceded by the version banner
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner with the short usage message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
				223
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	224
# Load from configuration and fill non-given data.
# Values already set on the command line are never overwritten.
if ($cfg_file && -e $cfg_file) {
  my %config;

  print "Reading config from $cfg_file\n";

  Config::Simple->import_from($cfg_file, \%config);

  # Single-value options: The configuration file uses the dashed
  # option names, while %cfg is keyed with underscores
  foreach my $key (qw!output cache-size input-base token overwrite
                      meta base-sentences base-paragraphs base-pagebreaks
                      gzip to-tar log cache non-word-tokens
                      non-verbal-tokens sequential-extraction cache-init
                      koral extract-dir jobs!) {

    # Copy first, then transliterate - this does not
    # require the /r modifier (Perl 5.14+)
    (my $underlined = $key) =~ tr/-/_/;

    if (!defined($cfg{$underlined}) && defined $config{$key}) {
      $cfg{$underlined} = $config{$key};
    };
  };

  # Multi-value options (skip, sigle, anno) are given as a
  # semicolon separated list, with optional whitespace around
  # the separator
  foreach my $multi (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  ) {
    my ($key, $values) = @$multi;

    # Command line values take precedence
    next if scalar(@$values) || !defined $config{$key};

    @$values = split /\s*;\s*/, $config{$key};
  };
};
				259
# Init variables and set default values
my $output      = $cfg{output};
my $input_base  = $cfg{input_base};
my $gzip        = $cfg{gzip};
my $to_tar      = $cfg{to_tar};
my $extract_dir = $cfg{extract_dir};
my $token_base  = $cfg{token} // 'OpenNLP#tokens';

# The command line option --cache is stored as 'cache_file',
# while the configuration file entry is stored as 'cache' -
# respect both, with the command line taking precedence
my $cache_file = $cfg{cache_file} // $cfg{cache} // 'korapxml2krill.cache';

my $jobs                  = $cfg{jobs} // 0;
my $cache_delete          = $cfg{cache_delete} // 1;
my $base_sentences        = lc($cfg{base_sentences} // '');
my $base_paragraphs       = lc($cfg{base_paragraphs} // '');
my $base_pagebreaks       = lc($cfg{base_pagebreaks} // '');
my $sequential_extraction = $cfg{sequential_extraction} // 0;

# Get tokenization basis in the form of 'Foundry#layer'.
# The declaration is unconditional, as the behaviour of
# 'my ... if ...' is undefined.
my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);

# Remove file extension
# (the layer is undefined, if the token base contains no '#')
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# Convert sigle to path construct, i.e. 'A_B.C' becomes 'A/B/C',
# ignoring surrounding whitespace
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# Lookup table of lowercased foundries or
# foundry#layer pairs to skip
my %skip;
$skip{lc($_)} = 1 foreach @skip;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	286
				287	# Initialize log4perl object
				288	Log::Log4perl->init({
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	289	'log4perl.rootLogger' => uc($cfg{log} // 'ERROR') . ', STDERR',
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	290	'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
				291	'log4perl.appender.STDERR.layout' => 'PatternLayout',
				292	'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
				293	});
				294
				295	my $log = Log::Log4perl->get_logger('main');
				296
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	297	if ($cmd && $output && (!defined($to_tar)) && (!-e $output \|\| !-d $output)) {
				298	$log->error("Directory '$output' does not exist.");
				299	exit 1;
				300	};
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	301
# Parameters for pod2usage on erroneous invocations:
# Print the version banner and the listed POD sections
# to STDOUT and request exit value 1
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;


# Auto adjust jobs:
# A job count of -1 requests five jobs per detected CPU core
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
				323
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	324
# Start serial processing:
# Every input is processed by a separate run of this script
# using the 'archive' command, one after another.
if ($cmd && $cmd eq 'serial') {

  # The output has to be an existing directory,
  # unless the results are written to a tar archive
  if ($output && !defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments passed on,
  # as each child receives its own input and output below.
  # Both forms are stripped: '--input VALUE' and '--input=VALUE'.
  my $io_flag     = qr/^(?:-i|--input|-o|--output)/;
  my $remove_next = 0;
  my @passed_argv;

  foreach my $arg (@keep_argv) {

    # Flag with a separate value - drop the flag and the next argument
    if ($arg =~ /${io_flag}$/) {
      $remove_next = 1;
      next;
    }

    # Value of the preceding flag
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Flag with attached value
    next if $arg =~ /${io_flag}=/;

    # Pass parameter
    push @passed_argv, $arg;
  };

  @keep_argv = @passed_argv;

  # Iterate over all inputs
  foreach my $serial_input (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($serial_input));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command - the list form bypasses the shell
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $serial_input, '-o', $new_out
    );
    print "Start serial processing of $serial_input to $new_out\n";

    # Start archiving and report failures,
    # but continue with the remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $serial_input failed (status $?)");
    };
  };

  exit;
};
				379
# List of all supported annotation layers as [Foundry, Layer]
# pairs, optionally followed by a third configuration element.
# The order of the entries is kept as is.
my @layers;

# The base layers are only added,
# if no other base for sentences or paragraphs is given
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

# Connexor
push @layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/;

# CoreNLP
push @layers, map { ['CoreNLP', $_] }
  qw/NamedEntities Sentences Morpho Constituency/;

# CMC
push @layers, ['CMC', 'Morpho'];

# DeReKo:
# Collect all structures that are based on DeReKo#Structure
my @dereko_attr = ();
push @dereko_attr, 'sentences'  if $base_sentences  eq 'dereko#structure';
push @dereko_attr, 'paragraphs' if $base_paragraphs eq 'dereko#structure';
push @dereko_attr, 'pagebreaks' if $base_pagebreaks eq 'dereko#structure';

# The collected structures are passed as a single
# third element, e.g. 'base-sentences-paragraphs'
push @layers, (
  $dereko_attr[0]
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

# DGD
push @layers, ['DGD', 'Morpho'];
push @layers, ['DGD', 'Structure', 'base-sentence']
  if $base_sentences eq 'dgd#structure';

# DRuKoLa
push @layers, ['DRuKoLa', 'Morpho'];

# Glemm
push @layers, ['Glemm', 'Morpho'];

# HNC
push @layers, ['HNC', 'Morpho'];

# LWC
push @layers, ['LWC', 'Dependency'];

# Malt
push @layers, ['Malt', 'Dependency'];

# Marmot
push @layers, ['MarMoT', 'Morpho'];

# Mate
push @layers, map { ['Mate', $_] } qw/Morpho Dependency/;

# MDParser
push @layers, ['MDParser', 'Dependency'];

# OpenNLP
push @layers, map { ['OpenNLP', $_] } qw/Morpho Sentences/;

# Redewiedergabe
push @layers, ['RWK', 'Morpho'];
push @layers, ['RWK', 'Structure'] if $base_sentences eq 'rwk#structure';

# Schreibgebrauch
push @layers, map { ['Sgbr', $_] } qw/Lemma Morpho/;

# Talismane
push @layers, map { ['Talismane', $_] } qw/Dependency Morpho/;

# TreeTagger
push @layers, map { ['TreeTagger', $_] } qw/Morpho Sentences/;

# XIP
push @layers, map { ['XIP', $_] }
  qw/Morpho Constituency Sentences Dependency/;
				477
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	478
# Check filters
my @filtered_anno;

# All default layers are skipped -
# only the explicitly requested 'Foundry#Layer' annotations are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {

  # Add to index file - respect skipping
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
				497
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	498
				499	# TODO: This should not be initialized for batch
				500	my $cache = Cache::FastMmap->new(
				501	share_file => $cache_file,
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	502	cache_size => ($cfg{cache_size} // '50m'),
				503	init_file => ($cfg{cache_init} // 1)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	504	);
				505
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	506	# Create batch object
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	507	my $batch_file = KorAP::XML::Batch::File->new(
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	508	cache => $cache,
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	509	meta_type => $cfg{meta},
				510	overwrite => $cfg{overwrite},
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	511	foundry => $token_base_foundry,
				512	layer => $token_base_layer,
				513	gzip => $gzip,
				514	log => $log,
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	515	koral => ($cfg{koral} // $KORAL_VERSION),
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	516	primary => $primary,
				517	pretty => $pretty,
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	518	anno => \@filtered_anno,
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	519	non_word_tokens => ($cfg{non_word_tokens} // 0),
				520	non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	521	);
				522
# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    # Expand wildcards - a single pattern may result in multiple files
    push(@new_input, bsd_glob($pattern));
  };

  # A wildcard without any match results in an empty list.
  # Stop here with a clear message, instead of passing
  # an undefined input to the commands below.
  unless (@new_input) {
    $log->error('No input found for ' . join(', ', @input));
    exit 1;
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
				542
				543
# Process a single file (no command was given)
unless ($cmd) {

  # Take the start time as early as possible, i.e. at compile time
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last call of stop_time
  # as well as the overall time since the start
  sub stop_time {
    my $now        = Benchmark->new;
    my $since_last = timestr(timediff($now, $main::LAST_STOP));
    my $overall    = timestr(timediff($now, $main::TIME));

    $log->info(
      'The code took: ' . $since_last . ' (overall: ' . $overall . ')'
    );

    # The next measurement starts from here
    $main::LAST_STOP = $now;
  };

  # Only the first input is processed
  my $input = $input[0];

  # Create and parse new document -
  # ensure the input path ends with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
				575
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	576
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	577	# Extract XML files
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	578	if ($cmd eq 'extract') {
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	579
Akron	d5643ad	2017-07-04 20:27:13 +0200	[diff] [blame]	580	# Output is required
				581	pod2usage(%ERROR_HASH) unless $output;
				582
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	583	# Create new archive object
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	584	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	585
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	586	# Check zip capabilities
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	587	unless ($archive->test_unzip) {
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	588	$log->error("Unzip is not installed or incompatible.");
				589	exit 1;
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	590	};
				591
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	592	# Add further annotation archived
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	593	$archive->attach($_) foreach @input[1..$#input];
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	594
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	595	# Will set @sigle
				596	my $prefix = set_sigle($archive);
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	597
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	598	# Iterate over all given sigles and extract
				599	foreach (@sigle) {
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	600
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	601	print "$_ ...\n";
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	602
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	603	# TODO: Make this OS independent
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	604	print '... ' . (
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	605
Akron	955b75b	2019-02-21 14:28:41 +0100	[diff] [blame]	606	# TODO:
				607	# - prefix???
				608	$archive->extract_sigle([$_], $output, $jobs)
				609	? '' : 'not '
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	610	);
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	611	print "extracted.\n";
				612	};
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	613	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	614
				615	# Can't create archive object
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	616	else {
				617	$log->error('Unable to extract from primary archive ' . $input[0]);
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	618	exit 1;
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	619	};
				620	}
				621
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	622
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	623	# Process an archive
				624	elsif ($cmd eq 'archive') {
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	625
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	626	my $archive_output;
				627
				628	# First extract, then archive
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	629	if (defined $extract_dir && !-d $input[0]) {
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	630
				631	# Create new archive object
				632	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
				633
				634	# Check zip capabilities
				635	unless ($archive->test_unzip) {
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	636	$log->error("Unzip is not installed or incompatible.");
				637	exit 1;
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	638	};
				639
				640	# Add further annotation archives
				641	$archive->attach($_) foreach @input[1..$#input];
				642
				643	# Create a temporary directory
				644	if ($extract_dir eq ':temp:') {
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	645	$extract_dir = tempdir(CLEANUP => 0);
				646	print "Temporarily extract to $extract_dir\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	647	};
				648
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	649	# Add some random extra to avoid clashes with multiple archives
				650	$extract_dir = catdir($extract_dir, random_string('cccccc'));
				651
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	652	# Extract to temporary directory
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	653	if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	654	@input = ($extract_dir);
				655	}
				656	else {
				657	$log->error('Unable to extract from primary archive ' . $input[0] .
				658	' to ' . $extract_dir);
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	659	exit 1;
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	660	};
				661	}
				662
				663	# Can't create archive object
				664	else {
				665	$log->error('Unable to extract from primary archive ' . $input[0]);
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	666	exit 1;
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	667	};
				668	};
				669
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	670	# Zero means: everything runs in the parent process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	671	my $pool = Parallel::ForkManager->new($jobs);
				672
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	673	my $count = 0; # Texts to process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	674	my $iter = 1; # Current text in process
				675
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	676	my $tar_archive;
				677	my $output_dir = $output;
				678	my $tar_fh;
				679
				680	# Initialize tar archive
				681	if ($to_tar) {
				682	$tar_archive = Archive::Tar::Builder->new(
				683	ignore_errors => 1
				684	);
				685
				686	# Set output name
				687	my $tar_file = $output;
				688	unless ($tar_file =~ /\.tar$/) {
				689	$tar_file .= '.tar';
				690	};
				691
				692	# Initiate the tar file
				693	print "Writing to file $tar_file\n";
				694	$tar_fh = IO::File->new($tar_file, 'w');
				695	$tar_fh->binmode(1);
				696
				697	# Set handle
				698	$tar_archive->set_handle($tar_fh);
				699
				700	# Output to temporary directory
				701	$output_dir = File::Temp->newdir;
				702	};
				703
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	704	# Report on fork message
				705	$pool->run_on_finish (
				706	sub {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	707	my ($pid, $code) = @_;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	708	my $data = pop;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	709
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	710	print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	711	($iter++) . "/$count]" .
				712	($code ? " $code" : '') .
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	713	' ' . $data->[0] . "\n";
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	714
				715	if (!$code && $to_tar && $data->[2]) {
				716	my $filename = $data->[2];
				717
				718	# Lock filehandle
				719	if (flock($tar_fh, LOCK_EX)) {
				720
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	721	my $clean_file = fileparse($filename);
				722
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	723	# Archive and remove file
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	724	$tar_archive->archive_as($filename => $clean_file);
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	725	unlink $filename;
				726
				727	# Unlock filehandle
				728	flock($tar_fh, LOCK_UN);
				729	}
				730	else {
				731	$log->warn("Unable to add $filename to archive");
				732	};
				733	};
				734
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	735	$data->[1] = undef if $data->[1];
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	736	}
				737	);
				738
				739	my $t;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	740	my $temp;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	741	print "Reading data ...\n";
				742
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	743	# unless (Cache::FastMmap->new(
				744	# share_file => $cache_file,
				745	# cache_size => $cache_size,
				746	# init_file => $cache_init
				747	# )) {
				748	# print "Unable to intialize cache '$cache_file'\n\n";
				749	# exit(1);
				750	# };
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	751
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	752
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	753	# Input is a directory
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	754	if (-d $input[0]) {
				755	my $it = Directory::Iterator->new($input[0]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	756	my @dirs;
				757	my $dir;
				758
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	759	# Todo: Make a DO WHILE
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	760	while (1) {
				761	if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	762	push @dirs, $dir;
				763	$it->prune;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	764	};
				765	last unless $it->next;
				766	};
				767
				768	print "Start processing ...\n";
				769	$t = Benchmark->new;
				770	$count = scalar @dirs;
				771
				772	DIRECTORY_LOOP:
				773	for (my $i = 0; $i < $count; $i++) {
				774
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	775	my $filename = catfile(
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	776	$output_dir,
Akron	41127e3	2020-08-07 12:46:19 +0200	[diff] [blame]	777	get_file_name($input[0], $dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	778	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	779
				780	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	781	$pool->start and next DIRECTORY_LOOP;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	782
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	783	if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	784	$pool->finish(
				785	0,
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	786	[
				787	"Processed " . $filename . ($return == -1 ? " - already existing" : ''),
				788	undef,
				789	$filename
				790	]
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	791	);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	792	}
				793	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	794	$pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	795	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	796	};
				797	}
				798
				799	# Input is a file
Akron	29866ac	2016-06-24 16:40:47 +0200	[diff] [blame]	800	elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	801
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	802	unless ($archive->test_unzip) {
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	803	$log->error("Unzip is not installed or incompatible.");
				804	exit 1;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	805	};
				806
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	807	# Add further annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	808	$archive->attach($_) foreach @input[1..$#input];
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	809
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	810	# Get sigles to extract
				811	my $prefix = set_sigle($archive);
				812
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	813	print "Start processing ...\n";
				814	$t = Benchmark->new;
				815	my @dirs = $archive->list_texts;
				816	$count = scalar @dirs;
				817
				818	ARCHIVE_LOOP:
				819	for (my $i = 0; $i < $count; $i++) {
				820
				821	# Split path information
				822	my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
				823
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	824	my $filename = catfile(
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	825	$output_dir,
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	826	get_file_name(
Akron	41127e3	2020-08-07 12:46:19 +0200	[diff] [blame]	827	$input[0],
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	828	catfile($corpus, $doc, $text)
				829	. '.json' . ($gzip ? '.gz' : '')
				830	)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	831	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	832
				833	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	834	$pool->start and next ARCHIVE_LOOP;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	835
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	836	# Create temporary directory
				837	$temp = File::Temp->newdir;
				838
Akron	bdf434a	2016-10-24 17:42:07 +0200	[diff] [blame]	839	# TODO: Check if $filename exists at the beginning,
				840	# because extraction can be horribly slow!
				841
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	842	# Extract from archive
Akron	955b75b	2019-02-21 14:28:41 +0100	[diff] [blame]	843	if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	844
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	845	# Create corpus directory
				846	my $input = catdir("$temp", $corpus);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	847
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	848	# Temporary directory
				849	my $dir = catdir($input, $doc, $text);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	850
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	851	# Write file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	852	if (my $return = $batch_file->process($dir => $filename)) {
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	853
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	854	# Delete temporary file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	855	$pool->finish(
				856	0,
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	857	[
				858	"Processed " . $filename . ($return == -1 ? " - already existing" : ''),
				859	$temp,
				860	$filename
				861	]
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	862	);
				863	#$pool->finish(0, ["Processed " . $filename, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	864	}
				865	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	866	# Delete temporary file
				867	$pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	868	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	869	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	870
				871	# Unable to extract
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	872	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	873	$pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	874	};
				875	};
				876	}
				877
				878	else {
				879	print "Input is neither a directory nor an archive.\n\n";
				880	};
				881
				882	$pool->wait_all_children;
				883
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	884	# Delete cache file
				885	unlink($cache_file) if $cache_delete;
				886
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	887	# Close tar filehandle
				888	if ($to_tar && $tar_fh) {
				889	$tar_archive->finish;
				890	$tar_fh->close;
				891	print "Wrote to tar archive.\n";
				892	};
				893
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	894	print timestr(timediff(Benchmark->new, $t))."\n";
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	895	print "Done.\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	896	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	897
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	898
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	899	# Build the list of sigles to process for an archive.
				900	# Operates on the file-level @sigle array and returns a prefix value.
					#
					# Parameter:
					#   $archive - archive object; list_texts, split_path, check_prefix
					#              and extract_sigle are called on it.
					#
					# If @sigle is empty, it is filled with one "corpus/doc/text" entry
					# for every text the archive lists.
					# Otherwise every entry that matches the doc sigle pattern (two path
					# segments) is extracted to $output immediately and removed from
					# @sigle, while all other entries are kept as text sigles.
					#
					# Returns $prefix: the first value returned by the last split_path
					# call, the result of check_prefix, or the initial value 1 if
					# neither was assigned.
				901	sub set_sigle {
				902	my $archive = shift;
				903
				904	my $prefix = 1;
					# NOTE(review): @dirs is only pushed to below and never read
					# again inside this sub.
				905	my @dirs = ();
				906
				907	# No sigles given: derive them from the texts in the archive
				908	unless (@sigle) {
				909
				910	# Iterate over all texts listed by the archive
				911	foreach ($archive->list_texts) {
				912
				913	push @dirs, $_;
				914
				915	# Split path information; $prefix is overwritten for every text
				916	($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
				917
				918	# TODO: Make this OS independent (the separator is a hard-coded '/')
				919	push @sigle, join '/', $corpus, $doc, $text;
				920	};
				921	}
				922
				923	# Sigles were given: separate doc sigles from text sigles
				924	else {
					# Collects the text sigles that stay in @sigle
				925	my @new_sigle;
				926
					# Becomes 1 as soon as check_prefix was called once
				927	my $prefix_check = 0;
				928
				929	# Iterate over all sigles
				930	foreach (@sigle) {
				931
				932	# Sigle is a doc sigle (two path segments, optional leading "./")
				933	if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
				934
				935	print "$_ ...";
				936	# Check if a prefix is needed (done only once per call)
				937	unless ($prefix_check) {
				938
				939	if ($prefix = $archive->check_prefix) {
				940	print " with prefix ...";
				941	};
				942	$prefix_check = 1;
				943	};
				944
				945	print "\n";
				946
					# Extract the doc sigle to $output right away and report the result;
					# the third argument is 1 if $sequential_extraction is set,
					# $jobs otherwise
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	947	print '... ' . (
Akron	955b75b	2019-02-21 14:28:41 +0100	[diff] [blame]	948	$archive->extract_sigle([$_], $output, $sequential_extraction ? 1 : $jobs)
				949	? '' : 'not '
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	950	);
				951	print "extracted.\n";
				952	}
				953
				954	# Sigle is a text sigle: keep it for later processing
				955	else {
				956	push @new_sigle, $_;
				957
					# Same one-time prefix check as in the doc sigle branch
				958	unless ($prefix_check) {
				959
				960	if ($prefix = $archive->check_prefix) {
				961	print " with prefix ...";
				962	};
				963	$prefix_check = 1;
				964	};
				965	};
				966	};
					# Only the text sigles remain in @sigle
				967	@sigle = @new_sigle;
				968	};
				969
				970	return $prefix;
				971	};
				972
				973
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	974	# Cleanup temporary extraction directory:
					# if $extract_dir is set, remove it with remove_tree and log the
					# value remove_tree returned.
					# NOTE(review): the archive branch only replaces $extract_dir with a
					# random subdirectory when it actually extracts from an archive; if
					# that step did not run, $extract_dir may still be the path given by
					# the user - confirm that removing it here is intended.
				975	if ($extract_dir) {
				976	my $objects = remove_tree($extract_dir, { safe => 1 });
Akron	f8df216	2020-08-07 15:03:39 +0200	[diff] [blame^]	977	$log->info("Removed directory $extract_dir with $objects objects");
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	978	};
				979
				980
					# Final newline on STDOUT
				981	print "\n";
				982
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	983	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	984
				985	=pod
				986
				987	=encoding utf8
				988
				989	=head1 NAME
				990
Akron	42f48c1	2020-02-14 13:08:13 +0100	[diff] [blame]	991	korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	992
				993
				994	=head1 SYNOPSIS
				995
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	996	korapxml2krill [archive\|extract] --input <directory\|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	997
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	998
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	999	=head1 DESCRIPTION
				1000
				1001	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				1002	compatible with the L<Krill\|https://github.com/KorAP/Krill> indexer.
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1003	The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1004
				1005
				1006	=head1 INSTALLATION
				1007
				1008	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm\|App::cpanminus>.
				1009
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	1010	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1011
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1012	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1013	be available on your command line immediately.
Akron	6eff23b	2018-09-24 10:31:20 +0200	[diff] [blame]	1014	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1015	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1016
				1017	=head1 ARGUMENTS
				1018
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1019	$ korapxml2krill -z --input <directory> --output <filename>
				1020
				1021	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1022	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1023
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1024	=over 2
				1025
				1026	=item B<archive>
				1027
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	1028	$ korapxml2krill archive -z --input <directory\|archive> --output <directory\|tar>
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1029
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1030	Converts an archive of KorAP-XML documents. It expects a directory
				1031	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1032
				1033	=item B<extract>
				1034
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1035	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				1036
				1037	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1038
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1039	=item B<serial>
				1040
				1041	$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
				1042
				1043	Convert archives sequentially. The inputs are not merged but treated
				1044	as they are (so they may be premerged or globs).
				1045	The C<--output> directory is treated as the base directory where subdirectories
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	1046	are created based on the archive name. In case the C<--to-tar> flag is given,
				1047	the output will be a tar file.
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1048
				1049
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1050	=back
				1051
				1052
				1053	=head1 OPTIONS
				1054
				1055	=over 2
				1056
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1057	=item B<--input\|-i> <directory\|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1058
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1059	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1060
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1061	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	1062	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				1063	file to batch process multiple files.
				1064	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1065
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1066	C<archive> supports multiple input zip files with the constraint
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1067	that the first archive listed contains all primary data files
				1068	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1069
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1070	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1071
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1072	Input may also be defined using BSD glob wildcards.
				1073
				1074	-i 'file/news*.zip'
				1075
				1076	The extended input array will be sorted in length order, so the shortest
				1077	path needs to contain all primary data files and all meta data files.
				1078
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	1079	(The directory structure follows the base directory format,
				1080	that may include a C<.> root folder.
				1081	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1082	need to be passed with a hash sign in front of the archive's name.
				1083	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1084
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1085	To support zip files, a version of C<unzip> needs to be installed that is
				1086	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1087
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1088	B<The root folder switch using the hash sign is experimental and
				1089	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	1090
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1091
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1092	=item B<--input-base\|-ib> <directory>
				1093
				1094	The base directory for inputs.
				1095
				1096
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1097	=item B<--output\|-o> <directory\|file>
				1098
				1099	Output folder for archive processing or
				1100	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1101	writes to C<STDOUT> by default
				1102	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1103
				1104	=item B<--overwrite\|-w>
				1105
				1106	Overwrite files that already exist.
				1107
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1108
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1109	=item B<--token\|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1110
				1111	Define the default tokenization by specifying
				1112	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1113	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1114	This will directly take the file instead of running
				1115	the layer implementation!
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1116
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1117
				1118	=item B<--base-sentences\|-bs> <foundry>#<layer>
				1119
				1120	Define the layer for base sentences.
				1121	If given, this will be used instead of using C<Base#Sentences>.
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	1122	Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
				1123	layers supported.
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1124
				1125	Defaults to unset.
				1126
				1127
				1128	=item B<--base-paragraphs\|-bp> <foundry>#<layer>
				1129
				1130	Define the layer for base paragraphs.
				1131	If given, this will be used instead of using C<Base#Paragraphs>.
				1132	Currently C<DeReKo#Structure> is the only additional layer supported.
				1133
				1134	Defaults to unset.
				1135
				1136
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	1137	=item B<--base-pagebreaks\|-bpb> <foundry>#<layer>
				1138
				1139	Define the layer for base pagebreaks.
				1140	Currently C<DeReKo#Structure> is the only layer supported.
				1141
				1142	Defaults to unset.
				1143
				1144
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1145	=item B<--skip\|-s> <foundry>[#<layer>]
				1146
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1147	Skip specific annotations by specifying the foundry
				1148	(and optionally the layer with a C<#>-prefix),
				1149	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1150	Can be set multiple times.
				1151
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1152
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1153	=item B<--anno\|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1154
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1155	Convert specific annotations by specifying the foundry
				1156	(and optionally the layer with a C<#>-prefix),
				1157	e.g. C<Mate> or C<Mate#Morpho>.
				1158	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1159
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1160
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1161	=item B<--primary\|-p>
				1162
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1163	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1164	Can be flagged using C<--no-primary> as well.
				1165	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1166
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1167
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	1168	=item B<--non-word-tokens\|-nwt>
				1169
				1170	Tokenize non-word tokens like word tokens (defined as matching
				1171	C</[\d\w]/>). Useful to treat punctuation as tokens.
				1172
				1173	Defaults to unset.
				1174
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1175
				1176	=item B<--non-verbal-tokens\|-nvt>
				1177
				1178	Tokenize non-verbal tokens that are marked in the primary data with
				1179	the Unicode symbol 'Black Vertical Rectangle' (U+25AE).
				1180
				1181	Defaults to unset.
				1182
				1183
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1184	=item B<--jobs\|-j>
				1185
				1186	Define the number of concurrent jobs in separate forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1187	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1188	Defaults to C<0> (everything runs in a single process).
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1189
				1190	If C<sequential-extraction> is not set to false, this will
				1191	also apply to extraction.
				1192
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	1193	Pass -1, and the value will be set automatically to 5
				1194	times the number of available cores.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1195	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1196
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1197
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	1198	=item B<--koral\|-k>
				1199
				1200	Version of the output format. Supported versions are:
				1201	C<0> for legacy serialization, C<0.03> for serialization
				1202	with metadata fields as key-values on the root object,
				1203	C<0.4> for serialization with metadata fields as a list
				1204	of C<"@type":"koral:field"> objects.
				1205
				1206	Currently defaults to C<0.03>.
				1207
				1208
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1209	=item B<--sequential-extraction\|-se>
				1210
				1211	Flag to indicate that the C<jobs> value does not apply to extraction (archives are then extracted sequentially).
				1212	Some systems may have problems with extracting multiple archives
				1213	to the same folder at the same time.
				1214	Can be flagged using C<--no-sequential-extraction> as well.
				1215	Defaults to C<false>.
				1216
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1217
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1218	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1219
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1220	Define the metadata parser to use. Defaults to C<I5>.
				1221	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				1222	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1223
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1224
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1225	=item B<--pretty\|-y>
				1226
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1227	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1228	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1229
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1230
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1231	=item B<--gzip\|-z>
				1232
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1233	Compress the output.
				1234	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1235
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1236
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1237	=item B<--cache\|-c>
				1238
				1239	File to mmap a cache (using L<Cache::FastMmap>).
				1240	Defaults to C<korapxml2krill.cache> in the calling directory.
				1241
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1242
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1243	=item B<--cache-size\|-cs>
				1244
				1245	Size of the cache. Defaults to C<50m>.
				1246
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1247
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1248	=item B<--cache-init\|-ci>
				1249
				1250	Initialize cache file.
				1251	Can be flagged using C<--no-cache-init> as well.
				1252	Defaults to C<true>.
				1253
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1254
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1255	=item B<--cache-delete\|-cd>
				1256
				1257	Delete cache file after processing.
				1258	Can be flagged using C<--no-cache-delete> as well.
				1259	Defaults to C<true>.
				1260
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1261
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1262	=item B<--config\|-cfg>
				1263
				1264	Configure the parameters of your call in a file
				1265	of key-value pairs with whitespace separator
				1266
				1267	overwrite 1
				1268	token DeReKo#Structure
				1269	...
				1270
				1271	Supported parameters are:
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1272	C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1273	C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	1274	C<output>, C<koral>,
				1275	C<temporary-extract>, C<sequential-extraction>,
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1276	C<base-sentences>, C<base-paragraphs>,
				1277	C<base-pagebreaks>,
				1278	C<skip> (semicolon separated), C<sigle>
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1279	(semicolon separated), C<anno> (semicolon separated).
				1280
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1281	Configuration parameters will always be overwritten by
				1282	passed parameters.
				1283
				1284
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1285	=item B<--temporary-extract\|-te>
				1286
				1287	Only valid for the C<archive> command.
				1288
				1289	This will first extract all files into a
				1290	directory and then will archive.
				1291	If the directory is given as C<:temp:>,
				1292	a temporary directory is used.
				1293	This is especially useful to avoid
				1294	massive unzipping and potential
				1295	network latency.
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1296
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1297
Akron	c93a080	2019-07-11 15:48:34 +0200	[diff] [blame]	1298	=item B<--to-tar>
				1299
				1300	Only valid for the C<archive> command.
				1301
				1302	Writes the output into a tar archive.
				1303
				1304
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1305	=item B<--sigle\|-sg>
				1306
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1307	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1308	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1309	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	1310	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1311	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1312	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1313
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1314
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1315	=item B<--log\|-l>
				1316
				1317	The L<Log::Log4perl> log level, defaults to C<ERROR>.
				1318
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1319
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1320	=item B<--help\|-h>
				1321
Akron	42f48c1	2020-02-14 13:08:13 +0100	[diff] [blame]	1322	Print help information.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1323
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1324
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1325	=item B<--version\|-v>
				1326
				1327	Print version information.
				1328
				1329	=back
				1330
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1331
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1332	=head1 ANNOTATION SUPPORT
				1333
				1334	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				1335	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				1336	The base foundry with paragraphs, sentences, and the text element are mandatory for
				1337	L<Krill\|https://github.com/KorAP/Krill>.
				1338
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1339	Base
				1340	#Paragraphs
				1341	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1342
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1343	Connexor
				1344	#Morpho
				1345	#Phrase
				1346	#Sentences
				1347	#Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1348
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1349	CoreNLP
				1350	#Constituency
				1351	#Morpho
				1352	#NamedEntities
				1353	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1354
Akron	ce125b6	2017-06-19 11:54:36 +0200	[diff] [blame]	1355	CMC
				1356	#Morpho
				1357
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1358	DeReKo
				1359	#Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1360
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	1361	DGD
				1362	#Morpho
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	1363	#Structure
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	1364
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1365	DRuKoLa
				1366	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1367
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1368	Glemm
				1369	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1370
Akron	ea1aed5	2018-07-19 14:43:34 +0200	[diff] [blame]	1371	HNC
				1372	#Morpho
				1373
Akron	4c67919	2018-01-16 17:41:49 +0100	[diff] [blame]	1374	LWC
				1375	#Dependency
				1376
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1377	Malt
				1378	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1379
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1380	MarMoT
				1381	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1382
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1383	Mate
				1384	#Dependency
				1385	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1386
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1387	MDParser
				1388	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1389
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1390	OpenNLP
				1391	#Morpho
				1392	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1393
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame]	1394	RWK
				1395	#Morpho
				1396	#Structure
				1397
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1398	Sgbr
				1399	#Lemma
				1400	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1401
Akron	7d5e638	2019-08-08 16:36:27 +0200	[diff] [blame]	1402	Talismane
				1403	#Dependency
				1404	#Morpho
				1405
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1406	TreeTagger
				1407	#Morpho
				1408	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1409
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1410	XIP
				1411	#Constituency
				1412	#Morpho
				1413	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1414
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1415
				1416	More importers are in preparation.
				1417	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				1418	See the built-in annotation importers as examples.
				1419
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1420
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1421	=head1 About KorAP-XML
				1422
				1423	KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
				1424	data model (Bański et al. 2013), where text data are stored physically
				1425	separated from their interpretations (i.e. annotations).
				1426	A text document in KorAP-XML therefore consists of several files
				1427	containing primary data, metadata and annotations.
				1428
				1429	The structure of a single KorAP-XML document can be as follows:
				1430
				1431	- data.xml
				1432	- header.xml
				1433	+ base
				1434	- tokens.xml
				1435	- ...
				1436	+ struct
				1437	- structure.xml
				1438	- ...
				1439	+ corenlp
				1440	- morpho.xml
				1441	- constituency.xml
				1442	- ...
				1443	+ tree_tagger
				1444	- morpho.xml
				1445	- ...
				1446	- ...
				1447
				1448	The C<data.xml> contains the primary data, the C<header.xml> contains
				1449	the metadata, and the annotation layers are stored in subfolders
				1450	like C<base>, C<struct> or C<corenlp>
				1451	(so-called "foundries"; Bański et al. 2013).
				1452
				1453	Metadata is available in the TEI-P5 variant I5
Akron	d4c5c10	2020-02-11 11:47:59 +0100	[diff] [blame]	1454	(Lüngen and Sperberg-McQueen 2012). See the documentation in
				1455	L<KorAP::XML::Meta::I5> for translatable fields.
				1456
				1457	Annotations correspond to a variant of the TEI-P5 feature structures
				1458	(TEI Consortium; Lee et al. 2004).
Akron	72bc522	2020-02-06 16:00:13 +0100	[diff] [blame]	1459	Annotation feature structures refer to character sequences of the primary text
				1460	inside the C<text> element of the C<data.xml>.
				1461	A single annotation containing the lemma of a token can have the following structure:
				1462
				1463	<span from="0" to="3">
				1464	<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
				1465	<f name="lex">
				1466	<fs>
				1467	<f name="lemma">zum</f>
				1468	</fs>
				1469	</f>
				1470	</fs>
				1471	</span>
				1472
				1473	The C<from> and C<to> attributes refer to the character span
				1474	in the primary text.
				1475	Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
				1476	the structure may vary. See L<KorAP::XML::Annotation::*> for various
				1477	annotation preprocessors.
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1478
				1479	Multiple KorAP-XML documents are organized on three levels following
				1480	the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
				1481	corpus E<gt> document E<gt> text. On each level, metadata information
				1482	can be stored, which C<korapxml2krill> will merge into a single metadata
				1483	object per text. A corpus is therefore structured as follows:
				1484
				1485	+ <corpus>
				1486	- header.xml
				1487	+ <document>
				1488	- header.xml
				1489	+ <text>
				1490	- data.xml
				1491	- header.xml
				1492	- ...
				1493	- ...
				1494
				1495	A single text can be identified by the concatenation of
				1496	the corpus identifier, the document identifier and the text identifier.
				1497	This identifier is called the text sigle
				1498	(e.g. a text with the identifier C<18486> in the document C<060> in the
				1499	corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
				1500
				1501	These corpora are often stored in zip files, which C<korapxml2krill>
				1502	can deal with. Corpora may also be split into multiple zip archives
				1503	(e.g. one zip file per foundry), which is also supported (see C<--input>).
				1504
				1505	Examples of KorAP-XML files are included in L<KorAP::XML::Krill>
				1506	in the form of a test suite.
				1507	The resulting JSON format merges all annotation layers
				1508	based on a single token stream.
				1509
				1510	=head2 References
				1511
				1512	Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
				1513	KorAP data model: first approximation, December.
				1514
				1515	Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
				1516	"The New IDS Corpus Analysis Platform: Challenges and Prospects",
				1517	Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
				1518	L<PDF\|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
				1519
				1520	Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
				1521	"Robust corpus architecture: a new look at virtual collections and data access",
				1522	Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
				1523	L<PDF\|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
				1524
				1525	Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
				1526	Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
				1527	"Towards an international standard on feature structure representation",
				1528	Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
				1529	pp. 373-376.
				1530	L<PDF\|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
				1531
				1532	Harald Lüngen and C. M. Sperberg-McQueen (2012):
				1533	"A TEI P5 Document Grammar for the IDS Text Model",
				1534	Journal of the Text Encoding Initiative, Issue 3 \| November 2012.
				1535	L<PDF\|https://journals.openedition.org/jtei/pdf/508>
				1536
				1537	TEI Consortium, eds:
				1538	"Feature Structures",
				1539	Guidelines for Electronic Text Encoding and Interchange.
				1540	L<html\|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
				1541
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1542	=head1 AVAILABILITY
				1543
				1544	https://github.com/KorAP/KorAP-XML-Krill
				1545
				1546
				1547	=head1 COPYRIGHT AND LICENSE
				1548
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1549	Copyright (C) 2015-2020, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1550
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1551	Author: L<Nils Diewald\|https://nils-diewald.de/>
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1552
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1553	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1554
				1555	L<KorAP::XML::Krill> is developed as part of the L<KorAP\|http://korap.ids-mannheim.de/>
				1556	Corpus Analysis Platform at the
Akron	94262ce	2019-02-28 21:42:43 +0100	[diff] [blame]	1557	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1558	member of the
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1559	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1560
				1561	This program is free software published under the
				1562	L<BSD-2 License\|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				1563
				1564	=cut