Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: 0138423d8d8e26a67f7953a8798ac387cc00146c [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	10	use POSIX qw/ceil/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	11	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	12	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	13	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	14	use Directory::Iterator;
Akron	c4ec093	2020-08-06 09:19:22 +0200	[diff] [blame]	15	use KorAP::XML::Krill qw!get_file_name_from_glob!;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	16	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	18	use KorAP::XML::Batch::File;
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	19	use Config::Simple;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	20	use Parallel::ForkManager;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	21	use v5.10;
				22	use Sys::Info;
				23	use Sys::Info::Constants qw( :device_cpu );
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	24	use File::Glob ':bsd_glob';
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	25	use File::Temp qw/tempdir/;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	26	use File::Path qw(remove_tree make_path);
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	27	use File::Basename;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	28	use Mojo::Collection 'c';
				29	use String::Random qw(random_string);
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	30	use IO::File;
				31	use Archive::Tar::Builder;
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	32	use Fcntl qw(:flock SEEK_END);
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	33
				34	# use KorAP::XML::ForkPool;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	35	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	36	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	37
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	38	# TODO: Use KorAP::XML::ForkPool!
				39
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	40	# CHANGES:
				41	# ----------------------------------------------------------
				42	# 2013/11/25
				43	# - Initial release
				44	#
				45	# 2014/10/29
				46	# - Merges foundry data to create indexer friendly documents
				47	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	48	# 2016/02/04
				49	# - renamed to korapxml2krill
				50	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	51	#
				52	# 2016/02/12
				53	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	54	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	55	#
				56	# 2016/02/14
				57	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	58	# - Added support for archive files
				59	#
				60	# 2016/02/15
				61	# - Fixed temporary directory bug
				62	# - Improved skipping before unzipping
				63	# - Added EXPERIMENTAL concurrency support
				64	#
				65	# 2016/02/23
				66	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	67	#
				68	# 2016/02/27
				69	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	70	#
				71	# 2016/03/17
				72	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	73	#
				74	# 2016/03/18
				75	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	76	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	77	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	78	# - Added multi archive support
				79	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	80	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	81	#
				82	# 2016/07/06
				83	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	84	#
				85	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	86	# - Fixed temporary path issue in script
				87	#
				88	# 2016/10/24
				89	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	90	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	91	# 2016/10/24
				92	# - Added support for document extraction
				93	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	94	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	95	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	96	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	97	# 2016/12/21
				98	# - added support for base-sentences and base-tokenizations
				99	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	100	# 2017/01/20
				101	# - added support for DRuKoLa annotations
				102	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	103	# 2017/02/08
				104	# - added support for pagebreak annotations
				105	#
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	106	# 2017/04/06
				107	# - added support for wildcards in input
				108	#
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	109	# 2017/04/07
				110	# - support configuration option
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	111	# - support for temporary extraction
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	112	#
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	113	# 2017/04/12
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	114	# - support serial processing
				115	# - support input root
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	116	# - introduced --sequential-extraction flag
Akron	ce125b6	2017-06-19 11:54:36 +0200	[diff] [blame]	117	#
				118	# 2017/06/19
				119	# - added support for DCK
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	120	#
				121	# 2017/06/29
				122	# - Fixed exit codes
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	123	#
				124	# 2017/07/04
				125	# - Fixed tar building process
Akron	4c67919	2018-01-16 17:41:49 +0100	[diff] [blame]	126	#
				127	# 2018/01/16
				128	# - Added LWC support
Akron	5fdc7e1	2018-07-19 12:37:48 +0200	[diff] [blame]	129	#
				130	# 2018/07/19
				131	# - Preliminary support for HNC.
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	132	#
				133	# 2019/01/22
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	134	# - Preliminary support for DGD.
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	135	# - Support for non-word tokens.
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	136	#
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	137	# 2019/02/13
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	138	# - Support for 'koral:field' array.
				139	# - Support for Koral versioning.
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	140	# - Ignore temporary extract parameter on
				141	# directory archiving.
Akron	7d5e638	2019-08-08 16:36:27 +0200	[diff] [blame]	142	#
				143	# 2019/08/08
				144	# - Support for Talismane.
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	145	#
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	146	# 2019/12/17
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	147	# - Added support for DGD pseudo-sentences
				148	# based on anchor milestones.
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	149	# - Support for non-verbal annotations.
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame]	150	#
				151	# 2020/04/23
				152	# - Added support for Redewiedergabe-Korpus structure
				153	# annotations, based on sentence and paragraph milestones
				154	# - Added support for Redewiedergabe-Korpus morphology
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	155	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	156
# Release metadata and the banner that is shown by --help and --version
our $LOCAL         = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $LAST_CHANGE   = '2020/04/23';
our $VERSION_MSG   = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
				163
# Prototypes
sub get_file_name($);

# Parse command: a leading argument that does not start with '-'
# is taken as the command and removed from the argument list
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /\A-/;

# Keep a copy of the remaining arguments, as they were passed
my @keep_argv = @ARGV;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	174
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# Scalar options are declared in place and stay undefined unless they
# were passed explicitly. Aliases are separated by a plain '|' in the
# option specification (an escaped '\|' would be part of the option name).
GetOptions(
  'input|i=s'                 => \@input,
  'input-base|ib=s'           => \(my $input_base),
  'output|o=s'                => \(my $output),
  'overwrite|w'               => \(my $overwrite),
  'meta|m=s'                  => \(my $meta),
  'token|t=s'                 => \(my $token_base),
  'base-sentences|bs=s'       => \(my $base_sentences),
  'base-paragraphs|bp=s'      => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s'     => \(my $base_pagebreaks),
  'gzip|z'                    => \(my $gzip),
  'temporary-extract|te=s'    => \(my $extract_dir),
  'skip|s=s'                  => \@skip,
  'sigle|sg=s'                => \@sigle,
  'cache|c=s'                 => \(my $cache_file),
  'config|cfg=s'              => \(my $cfg_file),
  'log|l=s'                   => \(my $log_level),
  'anno|a=s'                  => \@anno,
  'primary|p!'                => \(my $primary),
  'pretty|y'                  => \(my $pretty),
  'jobs|j=i'                  => \(my $jobs),
  'koral|k=f'                 => \(my $koral),
  'to-tar'                    => \(my $to_tar),
  'non-word-tokens|nwt'       => \(my $non_word_tokens),
  'non-verbal-tokens|nvt'     => \(my $non_verbal_tokens),
  'sequential-extraction|se'  => \(my $sequential_extraction),
  'cache-size|cs=s'           => \(my $cache_size),
  'cache-delete|cd!'          => \(my $cache_delete),
  'cache-init|ci!'            => \(my $cache_init),

  # Print the selected POD sections together with the version banner
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
				224
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	225
# Load from configuration.
# Values from the configuration file only fill options that were not
# given on the command line.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: configuration key => variable to fill
  my @scalar_options = (
    'overwrite'             => \$overwrite,
    'gzip'                  => \$gzip,
    'jobs'                  => \$jobs,
    'koral'                 => \$koral,
    'input-base'            => \$input_base,
    'temporary-extract'     => \$extract_dir,
    'token'                 => \$token_base,
    'non-word-tokens'       => \$non_word_tokens,
    'non-verbal-tokens'     => \$non_verbal_tokens,
    'cache'                 => \$cache_file,
    'cache-size'            => \$cache_size,
    'cache-delete'          => \$cache_delete,
    'cache-init'            => \$cache_init,
    'sequential-extraction' => \$sequential_extraction,
    'meta'                  => \$meta,
    'output'                => \$output,
    'base-sentences'        => \$base_sentences,
    'base-paragraphs'       => \$base_paragraphs,
    'base-pagebreaks'       => \$base_pagebreaks,
    'to-tar'                => \$to_tar,
    'log'                   => \$log_level,
  );

  while (my ($key, $target) = splice(@scalar_options, 0, 2)) {
    next if defined $$target || !defined $config{$key};
    $$target = $config{$key};
  };

  # List options: a single configuration value holds all entries,
  # separated by semicolons (surrounding whitespace is optional)
  my @list_options = (
    'skip'  => \@skip,
    'sigle' => \@sigle,
    'anno'  => \@anno,
  );

  while (my ($key, $target) = splice(@list_options, 0, 2)) {
    next if scalar(@$target) || !defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
				352
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	353
# Fill in the built-in default for every option that is still undefined
$token_base            = 'OpenNLP#tokens'       unless defined $token_base;
$cache_file            = 'korapxml2krill.cache' unless defined $cache_file;
$cache_size            = '50m'                  unless defined $cache_size;
$jobs                  = 0                      unless defined $jobs;
$koral                 = $KORAL_VERSION         unless defined $koral;
$cache_delete          = 1                      unless defined $cache_delete;
$cache_init            = 1                      unless defined $cache_init;
$sequential_extraction = 0                      unless defined $sequential_extraction;
$log_level             = 'ERROR'                unless defined $log_level;
$non_word_tokens       = 0                      unless defined $non_word_tokens;
$non_verbal_tokens     = 0                      unless defined $non_verbal_tokens;

# The base annotations default to the empty string and are lowercased,
# as they are compared with lowercase names further down
foreach my $base ($base_sentences, $base_paragraphs, $base_pagebreaks) {
  $base = lc($base // '');
};
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	373
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	374
				375	# Initialize log4perl object
				376	Log::Log4perl->init({
				377	'log4perl.rootLogger' => uc($log_level) . ', STDERR',
				378	'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
				379	'log4perl.appender.STDERR.layout' => 'PatternLayout',
				380	'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
				381	});
				382
				383	my $log = Log::Log4perl->get_logger('main');
				384
				385
				386	print "Reading config from $cfg_file\n" if $cfg_file;
				387
				388
# Arguments for pod2usage on invalid invocations: print the selected
# POD sections with the version banner and exit with status 1.
# The section names are separated by a plain '|'.
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	402
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	403
# A job count of -1 requests automatic sizing:
# five times the CPU count reported by Sys::Info
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info(sprintf('Run using %s jobs on %s cores', $jobs, $cores));
};
				409
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	410
# Start serial processing: every input is archived by a separate
# run of this script with the 'archive' command
if ($cmd && $cmd eq 'serial') {

  # Unless tarring was requested, the output has to be an existing directory
  if ($output && !defined($to_tar) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all input and output options from the arguments passed on,
  # as each child run gets its own '-i' and '-o' below
  my @child_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Flag followed by a separate value
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    };

    # Flag with attached value, e.g. '--input=archive.zip'
    if ($arg =~ /\A--?(?:input|i|output|o)=/) {
      $remove_next = 0;
      next;
    };

    # Value of the preceding flag
    if ($remove_next) {
      $remove_next = 0;
      next;
    };

    # Pass parameter
    push @child_argv, $arg;
  };
  @keep_argv = @child_argv;

  # Iterate over all inputs
  foreach my $archive (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($archive));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can't create path $new_out");
        exit 1;
      };
    };

    # Create archive command (list form, so no shell is involved)
    my @archive_cmd = ($^X, $0, 'archive', @keep_argv, '-i', $archive, '-o', $new_out);
    print "Start serial processing of $archive to $new_out\n";

    # Start archiving - a failing run is reported, but does not
    # stop the processing of the remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error("Archiving of $archive failed (status $?)");
    };
  };

  exit;
};
				465
# Lookup of everything to skip (lowercased)
my %skip = map { ( lc($_) => 1 ) } @skip;

# DeReKo: collect the base annotations that are taken from dereko#structure
my @dereko_attr = map { $_->[0] } grep { $_->[1] eq 'dereko#structure' } (
  ['sentences',  $base_sentences],
  ['paragraphs', $base_paragraphs],
  ['pagebreaks', $base_pagebreaks]
);

# All supported annotations as [Foundry, Layer] pairs, optionally
# followed by a third element. The order of the list is significant
# for the result and therefore kept.
my @layers = (

  # Base (only if no other base annotation was chosen)
  ($base_sentences  ? () : ['Base', 'Sentences']),
  ($base_paragraphs ? () : ['Base', 'Paragraphs']),

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho'],

  # DeReKo (with the chosen base annotations, if any)
  ['DeReKo', 'Structure', (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())],

  # DGD
  ['DGD', 'Morpho'],
  ($base_sentences eq 'dgd#structure' ? ['DGD', 'Structure', 'base-sentence'] : ()),

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho'],
  ($base_sentences eq 'rwk#structure' ? ['RWK', 'Structure'] : ()),

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);
				566
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	567
# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly requested
# annotations, given as 'Foundry#Layer'
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    my $layer   = lc($_->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    !($skip{$foundry} || $skip{$foundry . '#' . $layer});
  } @layers;
};
				586
				587	# Get tokenization basis
Akron	3c56f50	2017-10-24 15:37:27 +0200	[diff] [blame]	588	my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
				589
				590	# Remove file extension
				591	$token_base_layer =~ s/\.xml$//i;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	592
				593	# TODO: This should not be initialized for batch
				594	my $cache = Cache::FastMmap->new(
				595	share_file => $cache_file,
				596	cache_size => $cache_size,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	597	init_file => $cache_init
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	598	);
				599
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	600	# Create batch object
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	601	my $batch_file = KorAP::XML::Batch::File->new(
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	602	cache => $cache,
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	603	meta_type => $meta,
				604	overwrite => $overwrite,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	605	foundry => $token_base_foundry,
				606	layer => $token_base_layer,
				607	gzip => $gzip,
				608	log => $log,
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	609	koral => $koral,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	610	primary => $primary,
				611	pretty => $pretty,
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	612	anno => \@filtered_anno,
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	613	non_word_tokens => $non_word_tokens,
				614	non_verbal_tokens => $non_verbal_tokens
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	615	);
				616
# Get file name based on path information.
#
# Takes a file path and returns a name for it: temporary directory
# fragments and the path of the first input are removed, slashes are
# turned into dashes and - if there are enough segments - only the
# last three dash separated segments are kept.
sub get_file_name ($) {
  my $file = shift;

  # Path of the first input (may be missing)
  my $i = $input[0] // '';

  # For directories, strip a last segment without trailing slash
  if (-d $i) {
    $i =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the first occurrence of the input path. The path is quoted,
  # so characters like '.' or '+' are matched literally instead of
  # being interpreted as a regular expression.
  $file =~ s/\/?\Q$i\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
				633
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	634
# Convert sigle to path construct, e.g. 'A_B.C' becomes 'A/B/C'.
# Surrounding whitespace is optional and removed.
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# With a command given and no tarring requested,
# the output has to be an existing directory
if ($cmd && $output && !defined($to_tar) && !-d $output) {
  $log->error("Directory '$output' does not exist.");
  exit 1;
};
				644
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	645
				646	# Glob and prefix files
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	647	if (@input) {
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	648
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	649	my @new_input = ();
				650
				651	# Iterate over all inputs
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	652	foreach my $wild_card (@input) {
				653
				654	# Prefix with input root
				655	$wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;
				656
				657	push (@new_input, bsd_glob($wild_card));
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	658	};
				659
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	660	# Sort files by length
				661	@input = sort { length($a) <=> length($b) } @new_input;
				662
				663	print 'Input is ' . join(', ', @input)."\n";
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	664	};
				665
				666
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	667	# Process a single document (no sub command was given), then exit
				668	unless ($cmd) {
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	669	my $input = $input[0]; # Only the first input is converted
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	670
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	671	BEGIN { # Runs at compile time, so both timers are set before the script body executes
				672	$main::TIME = Benchmark->new;
				673	$main::LAST_STOP = Benchmark->new;
				674	};
				675
				676	sub stop_time { # Log the time since the previous call (or since start) and the overall run time
				677	my $new = Benchmark->new;
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	678	$log->info(
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	679	'The code took: '.
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	680	timestr(timediff($new, $main::LAST_STOP)) .
				681	' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
				682	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	683	$main::LAST_STOP = $new; # The next call measures from here
				684	};
				685
				686	# Create and parse new document
				687	$input =~ s{([^/])$}{$1/}; # Append a trailing slash if it is missing
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	688
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	689	# Process file
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	690	$batch_file->process($input, $output);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	691
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	692	# Delete cache file
				693	unlink($cache_file) if $cache_delete;
				694
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	695	stop_time;
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	696	exit; # Nothing below runs in single document mode
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	697	};
				698
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	699
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	700	# Extract XML files: unpack the texts listed in @sigle (or all texts, if none were given) from the archive to $output
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	701	if ($cmd eq 'extract') {
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	702
Akron	d5643ad	2017-07-04 20:27:13 +0200	[diff] [blame]	703	# Output is required
				704	pod2usage(%ERROR_HASH) unless $output;
				705
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	706	# Create new archive object
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	707	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) { # The first input has to be a regular file
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	708
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	709	# Check zip capabilities
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	710	unless ($archive->test_unzip) {
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	711	$log->error("Unzip is not installed or incompatible.");
				712	exit 1;
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	713	};
				714
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	715	# Attach all further inputs as annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	716	$archive->attach($_) foreach @input[1..$#input];
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	717
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	718	# Will set @sigle
				719	my $prefix = set_sigle($archive); # NOTE(review): $prefix is not used below (see the TODO in the loop)
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	720
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	721	# Iterate over all given sigles and extract
				722	foreach (@sigle) {
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	723
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	724	print "$_ ...\n";
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	725
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	726	# TODO: Make this OS independent
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	727	print '... ' . (
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	728
Akron	955b75b	2019-02-21 14:28:41 +0100	[diff] [blame]	729	# TODO:
				730	# - prefix???
				731	$archive->extract_sigle([$_], $output, $jobs) # $jobs is passed as is here, $sequential_extraction is not consulted
				732	? '' : 'not '
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	733	);
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	734	print "extracted.\n";
				735	};
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	736	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	737
				738	# Can't create archive object
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	739	else {
				740	$log->error('Unable to extract from primary archive ' . $input[0]);
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	741	exit 1;
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	742	};
				743	}
				744
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	745
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	746	# Process an archive: convert all texts of a directory or a zip archive, optionally in forked jobs and into a tar file
				747	elsif ($cmd eq 'archive') {
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	748
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	749	my $archive_output; # NOTE(review): not used anywhere in this branch
				750
				751	# First extract, then archive (only if an extraction directory is defined and the input is not a directory)
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	752	if (defined $extract_dir && !-d $input[0]) {
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	753
				754	# Create new archive object
				755	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
				756
				757	# Check zip capabilities
				758	unless ($archive->test_unzip) {
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	759	$log->error("Unzip is not installed or incompatible.");
				760	exit 1;
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	761	};
				762
				763	# Attach all further inputs as annotation archives
				764	$archive->attach($_) foreach @input[1..$#input];
				765
				766	# Create a temporary directory
				767	if ($extract_dir eq ':temp:') {
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	768	$extract_dir = tempdir(CLEANUP => 0); # CLEANUP => 0: File::Temp will not remove it automatically
				769	print "Temporarily extract to $extract_dir\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	770	};
				771
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	772	# Add some random extra to avoid clashes with multiple archives
				773	$extract_dir = catdir($extract_dir, random_string('cccccc'));
				774
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	775	# Extract to temporary directory
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	776	if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) { # Passes 1 if sequential extraction is requested, $jobs otherwise
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	777	@input = ($extract_dir); # Go on with the extracted directory as the only input
				778	}
				779	else {
				780	$log->error('Unable to extract from primary archive ' . $input[0] .
				781	' to ' . $extract_dir);
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	782	exit 1;
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	783	};
				784	}
				785
				786	# Can't create archive object
				787	else {
				788	$log->error('Unable to extract from primary archive ' . $input[0]);
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	789	exit 1;
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	790	};
				791	};
				792
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	793	# Zero means: everything runs in the parent process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	794	my $pool = Parallel::ForkManager->new($jobs);
				795
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	796	my $count = 0; # Texts to process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	797	my $iter = 1; # Current text in process
				798
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	799	my $tar_archive;
				800	my $output_dir = $output; # Replaced by a temporary directory below when writing to tar
				801	my $tar_fh;
				802
				803	# Initialize tar archive
				804	if ($to_tar) {
				805	$tar_archive = Archive::Tar::Builder->new(
				806	ignore_errors => 1
				807	);
				808
				809	# Set output name ('.tar' is appended if missing)
				810	my $tar_file = $output;
				811	unless ($tar_file =~ /\.tar$/) {
				812	$tar_file .= '.tar';
				813	};
				814
				815	# Initiate the tar file
				816	print "Writing to file $tar_file\n";
				817	$tar_fh = IO::File->new($tar_file, 'w'); # NOTE(review): the result is not checked before it is used below
				818	$tar_fh->binmode(1);
				819
				820	# Set handle
				821	$tar_archive->set_handle($tar_fh);
				822
				823	# Output to temporary directory (finished files are added to the tar archive in the callback below)
				824	$output_dir = File::Temp->newdir;
				825	};
				826
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	827	# Report on fork message (callback registered for every finished job)
				828	$pool->run_on_finish (
				829	sub {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	830	my ($pid, $code) = @_;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	831	my $data = pop; # Last argument: the array reference handed to finish() - [message, temp dir, file name]
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	832
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	833	print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	834	($iter++) . "/$count]" .
				835	($code ? " $code" : '') .
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	836	' ' . $data->[0] . "\n";
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	837
				838	if (!$code && $to_tar && $data->[2]) { # Successful job with an output file while writing to tar
				839	my $filename = $data->[2];
				840
				841	# Lock filehandle
				842	if (flock($tar_fh, LOCK_EX)) {
				843
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	844	my $clean_file = fileparse($filename); # File name without the directory part
				845
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	846	# Archive and remove file
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	847	$tar_archive->archive_as($filename => $clean_file);
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	848	unlink $filename;
				849
				850	# Unlock filehandle
				851	flock($tar_fh, LOCK_UN);
				852	}
				853	else {
				854	$log->warn("Unable to add $filename to archive");
				855	};
				856	};
				857
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	858	$data->[1] = undef if $data->[1]; # Drop the temporary directory entry, if one was passed
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	859	}
				860	);
				861
				862	my $t; # Start of processing, set in the directory and archive branches below
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	863	my $temp;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	864	print "Reading data ...\n";
				865
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	866	# unless (Cache::FastMmap->new(
				867	# share_file => $cache_file,
				868	# cache_size => $cache_size,
				869	# init_file => $cache_init
				870	# )) {
				871	# print "Unable to intialize cache '$cache_file'\n\n";
				872	# exit(1);
				873	# };
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	874
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	875
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	876	# Input is a directory
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	877	if (-d $input[0]) {
				878	my $it = Directory::Iterator->new($input[0]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	879	my @dirs;
				880	my $dir;
				881
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	882	# Todo: Make a DO WHILE
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	883	while (1) {
				884	if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) { # Collect every folder that contains a data.xml
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	885	push @dirs, $dir;
				886	$it->prune; # presumably skips the remainder of this folder - confirm against Directory::Iterator
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	887	};
				888	last unless $it->next;
				889	};
				890
				891	print "Start processing ...\n";
				892	$t = Benchmark->new;
				893	$count = scalar @dirs;
				894
				895	DIRECTORY_LOOP:
				896	for (my $i = 0; $i < $count; $i++) {
				897
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	898	my $filename = catfile(
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	899	$output_dir,
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	900	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	901	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	902
				903	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	904	$pool->start and next DIRECTORY_LOOP; # A true value (the parent side of a fork) moves on to the next text
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	905
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	906	if (my $return = $batch_file->process($dirs[$i] => $filename)) {
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	907	$pool->finish(
				908	0,
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	909	[
				910	"Processed " . $filename . ($return == -1 ? " - already existing" : ''), # A return value of -1 is reported as already existing
				911	undef,
				912	$filename
				913	]
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	914	);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	915	}
				916	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	917	$pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	918	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	919	};
				920	}
				921
				922	# Input is a file
Akron	29866ac	2016-06-24 16:40:47 +0200	[diff] [blame]	923	elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	924
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	925	unless ($archive->test_unzip) {
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	926	$log->error("Unzip is not installed or incompatible.");
				927	exit 1;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	928	};
				929
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	930	# Attach all further inputs as annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	931	$archive->attach($_) foreach @input[1..$#input];
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	932
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	933	# Get sigles to extract
				934	my $prefix = set_sigle($archive); # NOTE(review): shadowed by the loop variable of the same name below and otherwise unused
				935
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	936	print "Start processing ...\n";
				937	$t = Benchmark->new;
				938	my @dirs = $archive->list_texts;
				939	$count = scalar @dirs;
				940
				941	ARCHIVE_LOOP:
				942	for (my $i = 0; $i < $count; $i++) {
				943
				944	# Split path information
				945	my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
				946
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	947	my $filename = catfile(
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	948	$output_dir,
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	949	get_file_name(
				950	catfile($corpus, $doc, $text)
				951	. '.json' . ($gzip ? '.gz' : '')
				952	)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	953	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	954
				955	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	956	$pool->start and next ARCHIVE_LOOP; # A true value (the parent side of a fork) moves on to the next text
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	957
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	958	# Create temporary directory for the extraction of this text
				959	$temp = File::Temp->newdir;
				960
Akron	bdf434a	2016-10-24 17:42:07 +0200	[diff] [blame]	961	# TODO: Check if $filename exist at the beginning,
				962	# because extraction can be horrible slow!
				963
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	964	# Extract from archive
Akron	955b75b	2019-02-21 14:28:41 +0100	[diff] [blame]	965	if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	966
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	967	# Create corpus directory
				968	my $input = catdir("$temp", $corpus); # "$temp" stringifies the directory object to its path
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	969
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	970	# Temporary directory
				971	my $dir = catdir($input, $doc, $text);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	972
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	973	# Write file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	974	if (my $return = $batch_file->process($dir => $filename)) {
Akron	486f9ab	2017-04-22 23:25:19 +0200	[diff] [blame]	975
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	976	# Delete temporary file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	977	$pool->finish(
				978	0,
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	979	[
				980	"Processed " . $filename . ($return == -1 ? " - already existing" : ''), # A return value of -1 is reported as already existing
				981	$temp,
				982	$filename
				983	]
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	984	);
				985	#$pool->finish(0, ["Processed " . $filename, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	986	}
				987	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	988	# Delete temporary file
				989	$pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	990	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	991	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	992
				993	# Unable to extract
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	994	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	995	$pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	996	};
				997	};
				998	}
				999
				1000	else {
				1001	print "Input is neither a directory nor an archive.\n\n";
				1002	};
				1003
				1004	$pool->wait_all_children; # Wait until all forked jobs are done
				1005
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1006	# Delete cache file
				1007	unlink($cache_file) if $cache_delete;
				1008
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	1009	# Close tar filehandle
				1010	if ($to_tar && $tar_fh) {
				1011	$tar_archive->finish;
				1012	$tar_fh->close;
				1013	print "Wrote to tar archive.\n";
				1014	};
				1015
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1016	print timestr(timediff(Benchmark->new, $t))."\n"; # NOTE(review): $t is still undefined if the input was neither a directory nor an archive
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1017	print "Done.\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1018	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1019
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1020
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	1021	# For an archive, this will create the list of all sigles to process in the file level @sigle:
				1022	# without given sigles all texts of the archive are listed; otherwise doc sigles are extracted to $output right away and only text sigles are kept. Returns $prefix.
				1023	sub set_sigle {
				1024	my $archive = shift;
				1025
				1026	my $prefix = 1; # Returned unchanged only if no sigles were given and the archive lists no texts
				1027	my @dirs = (); # NOTE(review): filled below but never read in this sub
				1028
				1029	# No sigles given
				1030	unless (@sigle) {
				1031
				1032	# Get files
				1033	foreach ($archive->list_texts) {
				1034
				1035	push @dirs, $_;
				1036
				1037	# Split path information
				1038	($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_); # $prefix ends up holding the value of the last listed text
				1039
				1040	# TODO: Make this OS independent
				1041	push @sigle, join '/', $corpus, $doc, $text;
				1042	};
				1043	}
				1044
				1045	# Check sigle for doc sigles
				1046	else {
				1047	my @new_sigle;
				1048
				1049	my $prefix_check = 0; # Makes sure check_prefix is called only once
				1050
				1051	# Iterate over all sigle
				1052	foreach (@sigle) {
				1053
				1054	# Sigle is a doc sigle (an optional leading './' followed by exactly two path segments)
				1055	if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
				1056
				1057	print "$_ ...";
				1058	# Check if a prefix is needed
				1059	unless ($prefix_check) {
				1060
				1061	if ($prefix = $archive->check_prefix) {
				1062	print " with prefix ...";
				1063	};
				1064	$prefix_check = 1;
				1065	};
				1066
				1067	print "\n";
				1068
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	1069	print '... ' . (
Akron	955b75b	2019-02-21 14:28:41 +0100	[diff] [blame]	1070	$archive->extract_sigle([$_], $output, $sequential_extraction ? 1 : $jobs) # Doc sigles are extracted here and not added to @new_sigle
				1071	? '' : 'not '
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	1072	);
				1073	print "extracted.\n";
				1074	}
				1075
				1076	# Sigle is a text sigle
				1077	else {
				1078	push @new_sigle, $_;
				1079
				1080	unless ($prefix_check) {
				1081
				1082	if ($prefix = $archive->check_prefix) {
				1083	print " with prefix ...";
				1084	};
				1085	$prefix_check = 1;
				1086	};
				1087	};
				1088	};
				1089	@sigle = @new_sigle; # Only text sigles remain for the caller
				1090	};
				1091
				1092	return $prefix;
				1093	};
				1094
				1095
				1096
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1097	# Cleanup temporary extraction directory
				1098	if ($extract_dir) { # NOTE(review): also true if an extraction directory was given but nothing was extracted (input already a directory), so the given directory itself seems to get removed - confirm
				1099	my $objects = remove_tree($extract_dir, { safe => 1 }); # remove_tree returns the number of deleted files
				1100	print "Removed directory $extract_dir with $objects objects.\n";
				1101	};
				1102
				1103
				1104	print "\n";
				1105
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1106	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1107
				1108	=pod
				1109
				1110	=encoding utf8
				1111
				1112	=head1 NAME
				1113
Akron	42f48c1	2020-02-14 13:08:13 +0100	[diff] [blame]	1114	korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1115
				1116
				1117	=head1 SYNOPSIS
				1118
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1119	korapxml2krill [archive\|extract] --input <directory\|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1120
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1121
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1122	=head1 DESCRIPTION
				1123
				1124	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				1125	compatible with the L<Krill\|https://github.com/KorAP/Krill> indexer.
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1126	The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1127
				1128
				1129	=head1 INSTALLATION
				1130
				1131	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm\|App::cpanminus>.
				1132
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	1133	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1134
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1135	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1136	be available on your command line immediately.
Akron	6eff23b	2018-09-24 10:31:20 +0200	[diff] [blame]	1137	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1138	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1139
				1140	=head1 ARGUMENTS
				1141
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1142	$ korapxml2krill -z --input <directory> --output <filename>
				1143
				1144	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1145	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1146
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1147	=over 2
				1148
				1149	=item B<archive>
				1150
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	1151	$ korapxml2krill archive -z --input <directory\|archive> --output <directory\|tar>
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1152
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1153	Converts an archive of KorAP-XML documents. It expects a directory
				1154	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1155
				1156	=item B<extract>
				1157
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1158	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				1159
				1160	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1161
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1162	=item B<serial>
				1163
				1164	$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
				1165
				1166	Convert archives sequentially. The inputs are not merged but treated
				1167	as they are (so they may be premerged or globs).
				1168	The C<--output> directory is treated as the base directory where subdirectories
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	1169	are created based on the archive name. In case the C<--to-tar> flag is given,
				1170	the output will be a tar file.
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1171
				1172
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1173	=back
				1174
				1175
				1176	=head1 OPTIONS
				1177
				1178	=over 2
				1179
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1180	=item B<--input\|-i> <directory\|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1181
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1182	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1183
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1184	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	1185	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				1186	file to batch process multiple files.
				1187	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1188
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1189	C<archive> supports multiple input zip files with the constraint,
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1190	that the first archive listed contains all primary data files
				1191	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1192
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1193	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1194
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1195	Input may also be defined using BSD glob wildcards.
				1196
				1197	-i 'file/news*.zip'
				1198
				1199	The extended input array will be sorted in length order, so the shortest
				1200	path needs to contain all primary data files and all meta data files.
				1201
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	1202	(The directory structure follows the base directory format,
				1203	that may include a C<.> root folder.
				1204	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1205	need to be passed with a hash sign in front of the archive's name.
				1206	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1207
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1208	To support zip files, a version of C<unzip> needs to be installed that is
				1209	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1210
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1211	B<The root folder switch using the hash sign is experimental and
				1212	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	1213
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1214
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1215	=item B<--input-base\|-ib> <directory>
				1216
				1217	The base directory for inputs.
				1218
				1219
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1220	=item B<--output\|-o> <directory\|file>
				1221
				1222	Output folder for archive processing or
				1223	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1224	writes to C<STDOUT> by default
				1225	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1226
				1227	=item B<--overwrite\|-w>
				1228
				1229	Overwrite files that already exist.
				1230
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1231
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1232	=item B<--token\|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1233
				1234	Define the default tokenization by specifying
				1235	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1236	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1237	This will directly take the file instead of running
				1238	the layer implementation!
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1239
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1240
				1241	=item B<--base-sentences\|-bs> <foundry>#<layer>
				1242
				1243	Define the layer for base sentences.
				1244	If given, this will be used instead of using C<Base#Sentences>.
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	1245	Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
				1246	layers supported.
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1247
				1248	Defaults to unset.
				1249
				1250
				1251	=item B<--base-paragraphs\|-bp> <foundry>#<layer>
				1252
				1253	Define the layer for base paragraphs.
				1254	If given, this will be used instead of using C<Base#Paragraphs>.
				1255	Currently C<DeReKo#Structure> is the only additional layer supported.
				1256
				1257	Defaults to unset.
				1258
				1259
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	1260	=item B<--base-pagebreaks\|-bpb> <foundry>#<layer>
				1261
				1262	Define the layer for base pagebreaks.
				1263	Currently C<DeReKo#Structure> is the only layer supported.
				1264
				1265	Defaults to unset.
				1266
				1267
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1268	=item B<--skip\|-s> <foundry>[#<layer>]
				1269
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1270	Skip specific annotations by specifying the foundry
				1271	(and optionally the layer with a C<#>-prefix),
				1272	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1273	Can be set multiple times.
				1274
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1275
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1276	=item B<--anno\|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1277
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1278	Convert specific annotations by specifying the foundry
				1279	(and optionally the layer with a C<#>-prefix),
				1280	e.g. C<Mate> or C<Mate#Morpho>.
				1281	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1282
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1283
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1284	=item B<--primary\|-p>
				1285
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1286	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1287	Can be flagged using C<--no-primary> as well.
				1288	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1289
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1290
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	1291	=item B<--non-word-tokens\|-nwt>
				1292
				1293	Tokenize non-word tokens like word tokens (defined as matching
				1294	C</[\d\w]/>). Useful to treat punctuations as tokens.
				1295
				1296	Defaults to unset.
				1297
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1298
				1299	=item B<--non-verbal-tokens\|-nvt>
				1300
				1301	Tokenize non-verbal tokens marked in the primary data as
				1302	the unicode symbol 'Black Vertical Rectangle' aka \x25ae.
				1303
				1304	Defaults to unset.
				1305
				1306
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1307	=item B<--jobs\|-j>
				1308
				1309	Define the number of concurrent jobs in separated forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1310	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1311	Defaults to C<0> (everything runs in a single process).
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1312
				1313	If C<sequential-extraction> is not set to false, this will
				1314	also apply to extraction.
				1315
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	1316	Pass -1, and the value will be set automatically to 5
				1317	times the number of available cores.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1318	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1319
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1320
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	1321	=item B<--koral\|-k>
				1322
				1323	Version of the output format. Supported versions are:
				1324	C<0> for legacy serialization, C<0.03> for serialization
				1325	with metadata fields as key-values on the root object,
				1326	C<0.4> for serialization with metadata fields as a list
				1327	of C<"@type":"koral:field"> objects.
				1328
				1329	Currently defaults to C<0.03>.
				1330
				1331
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1332	=item B<--sequential-extraction\|-se>
				1333
				1334	Flag to indicate, if the C<jobs> value also applies to extraction.
				1335	Some systems may have problems with extracting multiple archives
				1336	to the same folder at the same time.
				1337	Can be flagged using C<--no-sequential-extraction> as well.
				1338	Defaults to C<false>.
				1339
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1340
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1341	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1342
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1343	Define the metadata parser to use. Defaults to C<I5>.
				1344	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				1345	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1346
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1347
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1348	=item B<--pretty\|-y>
				1349
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1350	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1351	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1352
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1353
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1354	=item B<--gzip\|-z>
				1355
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1356	Compress the output.
				1357	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1358
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1359
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1360	=item B<--cache\|-c>
				1361
				1362	File to mmap a cache (using L<Cache::FastMmap>).
				1363	Defaults to C<korapxml2krill.cache> in the calling directory.
				1364
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1365
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1366	=item B<--cache-size\|-cs>
				1367
				1368	Size of the cache. Defaults to C<50m>.
				1369
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1370
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1371	=item B<--cache-init\|-ci>
				1372
				1373	Initialize cache file.
				1374	Can be flagged using C<--no-cache-init> as well.
				1375	Defaults to C<true>.
				1376
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1377
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1378	=item B<--cache-delete\|-cd>
				1379
				1380	Delete cache file after processing.
				1381	Can be flagged using C<--no-cache-delete> as well.
				1382	Defaults to C<true>.
				1383
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1384
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1385	=item B<--config\|-cfg>
				1386
				1387	Configure the parameters of your call in a file
				1388	of key-value pairs with whitespace separator
				1389
				1390	overwrite 1
				1391	token DeReKo#Structure
				1392	...
				1393
				1394	Supported parameters are:
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1395	C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1396	C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	1397	C<output>, C<koral>,
				1398	C<temporary-extract>, C<sequential-extraction>,
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1399	C<base-sentences>, C<base-paragraphs>,
				1400	C<base-pagebreaks>,
				1401	C<skip> (semicolon separated), C<sigle>
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1402	(semicolon separated), C<anno> (semicolon separated).
				1403
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1404	Configuration parameters will always be overwritten by
				1405	passed parameters.
				1406
				1407
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1408	=item B<--temporary-extract\|-te>
				1409
				1410	Only valid for the C<archive> command.
				1411
				1412	This will first extract all files into a
				1413	directory and then will archive.
				1414	If the directory is given as C<:temp:>,
				1415	a temporary directory is used.
				1416	This is especially useful to avoid
				1417	massive unzipping and potential
				1418	network latency.
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1419
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1420
Akron	c93a080	2019-07-11 15:48:34 +0200	[diff] [blame]	1421	=item B<--to-tar>
				1422
				1423	Only valid for the C<archive> command.
				1424
				1425	Writes the output into a tar archive.
				1426
				1427
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1428	=item B<--sigle\|-sg>
				1429
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1430	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1431	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1432	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	1433	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1434	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1435	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1436
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1437
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1438	=item B<--log\|-l>
				1439
				1440	The L<Log4perl> log level, defaults to C<ERROR>.
				1441
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1442
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1443	=item B<--help\|-h>
				1444
Akron	42f48c1	2020-02-14 13:08:13 +0100	[diff] [blame]	1445	Print help information.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1446
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1447
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1448	=item B<--version\|-v>
				1449
				1450	Print version information.
				1451
				1452	=back
				1453
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1454
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1455	=head1 ANNOTATION SUPPORT
				1456
				1457	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				1458	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				1459	The base foundry with paragraphs, sentences, and the text element are mandatory for
				1460	L<Krill\|https://github.com/KorAP/Krill>.
				1461
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1462	Base
				1463	#Paragraphs
				1464	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1465
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1466	Connexor
				1467	#Morpho
				1468	#Phrase
				1469	#Sentences
				1470	#Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1471
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1472	CoreNLP
				1473	#Constituency
				1474	#Morpho
				1475	#NamedEntities
				1476	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1477
Akron	ce125b6	2017-06-19 11:54:36 +0200	[diff] [blame]	1478	CMC
				1479	#Morpho
				1480
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1481	DeReKo
				1482	#Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1483
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	1484	DGD
				1485	#Morpho
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	1486	#Structure
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	1487
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1488	DRuKoLa
				1489	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1490
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1491	Glemm
				1492	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1493
Akron	ea1aed5	2018-07-19 14:43:34 +0200	[diff] [blame]	1494	HNC
				1495	#Morpho
				1496
Akron	4c67919	2018-01-16 17:41:49 +0100	[diff] [blame]	1497	LWC
				1498	#Dependency
				1499
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1500	Malt
				1501	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1502
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1503	MarMoT
				1504	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1505
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1506	Mate
				1507	#Dependency
				1508	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1509
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1510	MDParser
				1511	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1512
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1513	OpenNLP
				1514	#Morpho
				1515	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1516
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame]	1517	RWK
				1518	#Morpho
				1519	#Structure
				1520
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1521	Sgbr
				1522	#Lemma
				1523	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1524
Akron	7d5e638	2019-08-08 16:36:27 +0200	[diff] [blame]	1525	Talismane
				1526	#Dependency
				1527	#Morpho
				1528
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1529	TreeTagger
				1530	#Morpho
				1531	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1532
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1533	XIP
				1534	#Constituency
				1535	#Morpho
				1536	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1537
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1538
				1539	More importers are in preparation.
				1540	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				1541	See the built-in annotation importers as examples.
				1542
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1543
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1544	=head1 About KorAP-XML
				1545
				1546	KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
				1547	data model (Bański et al. 2013), where text data are stored physically
				1548	separated from their interpretations (i.e. annotations).
				1549	A text document in KorAP-XML therefore consists of several files
				1550	containing primary data, metadata and annotations.
				1551
				1552	The structure of a single KorAP-XML document can be as follows:
				1553
				1554	- data.xml
				1555	- header.xml
				1556	+ base
				1557	- tokens.xml
				1558	- ...
				1559	+ struct
				1560	- structure.xml
				1561	- ...
				1562	+ corenlp
				1563	- morpho.xml
				1564	- constituency.xml
				1565	- ...
				1566	+ tree_tagger
				1567	- morpho.xml
				1568	- ...
				1569	- ...
				1570
				1571	The C<data.xml> contains the primary data, the C<header.xml> contains
				1572	the metadata, and the annotation layers are stored in subfolders
				1573	like C<base>, C<struct> or C<corenlp>
				1574	(so-called "foundries"; Bański et al. 2013).
				1575
				1576	Metadata is available in the TEI-P5 variant I5
Akron	d4c5c10	2020-02-11 11:47:59 +0100	[diff] [blame]	1577	(Lüngen and Sperberg-McQueen 2012). See the documentation in
				1578	L<KorAP::XML::Meta::I5> for translatable fields.
				1579
				1580	Annotations correspond to a variant of the TEI-P5 feature structures
				1581	(TEI Consortium; Lee et al. 2004).
Akron	72bc522	2020-02-06 16:00:13 +0100	[diff] [blame]	1582	Annotation feature structures refer to character sequences of the primary text
				1583	inside the C<text> element of the C<data.xml>.
				1584	A single annotation containing the lemma of a token can have the following structure:
				1585
				1586	<span from="0" to="3">
				1587	<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
				1588	<f name="lex">
				1589	<fs>
				1590	<f name="lemma">zum</f>
				1591	</fs>
				1592	</f>
				1593	</fs>
				1594	</span>
				1595
				1596	The C<from> and C<to> attributes refer to the character span
				1597	in the primary text.
				1598	Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
				1599	the structure may vary. See L<KorAP::XML::Annotation::*> for various
				1600	annotation preprocessors.
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1601
				1602	Multiple KorAP-XML documents are organized on three levels following
				1603	the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
				1604	corpus E<gt> document E<gt> text. On each level metadata information
				1605	can be stored, that C<korapxml2krill> will merge to a single metadata
				1606	object per text. A corpus is therefore structured as follows:
				1607
				1608	+ <corpus>
				1609	- header.xml
				1610	+ <document>
				1611	- header.xml
				1612	+ <text>
				1613	- data.xml
				1614	- header.xml
				1615	- ...
				1616	- ...
				1617
				1618	A single text can be identified by the concatenation of
				1619	the corpus identifier, the document identifier and the text identifier.
				1620	This identifier is called the text sigle
				1621	(e.g. a text with the identifier C<18486> in the document C<060> in the
				1622	corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
				1623
				1624	These corpora are often stored in zip files, which C<korapxml2krill>
				1625	can deal with. Corpora may also be split into multiple zip archives
				1626	(e.g. one zip file per foundry), which is also supported (see C<--input>).
				1627
				1628	Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
				1629	in the form of a test suite.
				1630	The resulting JSON format merges all annotation layers
				1631	based on a single token stream.
				1632
				1633	=head2 References
				1634
				1635	Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
				1636	KorAP data model: first approximation, December.
				1637
				1638	Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
				1639	"The New IDS Corpus Analysis Platform: Challenges and Prospects",
				1640	Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
				1641	L<PDF\|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
				1642
				1643	Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
				1644	"Robust corpus architecture: a new look at virtual collections and data access",
				1645	Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
				1646	L<PDF\|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
				1647
				1648	Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
				1649	Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
				1650	"Towards an international standard on feature structure representation",
				1651	Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
				1652	pp. 373-376.
				1653	L<PDF\|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
				1654
				1655	Harald Lüngen and C. M. Sperberg-McQueen (2012):
				1656	"A TEI P5 Document Grammar for the IDS Text Model",
				1657	Journal of the Text Encoding Initiative, Issue 3 \| November 2012.
				1658	L<PDF\|https://journals.openedition.org/jtei/pdf/508>
				1659
				1660	TEI Consortium, eds:
				1661	"Feature Structures",
				1662	Guidelines for Electronic Text Encoding and Interchange.
				1663	L<html\|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
				1664
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1665	=head1 AVAILABILITY
				1666
				1667	https://github.com/KorAP/KorAP-XML-Krill
				1668
				1669
				1670	=head1 COPYRIGHT AND LICENSE
				1671
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1672	Copyright (C) 2015-2020, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1673
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1674	Author: L<Nils Diewald\|https://nils-diewald.de/>
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1675
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1676	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1677
				1678	L<KorAP::XML::Krill> is developed as part of the L<KorAP\|http://korap.ids-mannheim.de/>
				1679	Corpus Analysis Platform at the
Akron	94262ce	2019-02-28 21:42:43 +0100	[diff] [blame]	1680	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1681	member of the
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1682	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1683
				1684	This program is free software published under the
				1685	L<BSD-2 License\|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				1686
				1687	=cut