Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: 54edaa8285037ce350ded320c45b20c26bd7ab6e [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	10	use POSIX qw/ceil/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	11	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	12	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	13	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	14	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	15	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	16	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	18	use KorAP::XML::Batch::File;
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	19	use Config::Simple;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	20	use Parallel::ForkManager;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	21	use v5.10;
				22	use Sys::Info;
				23	use Sys::Info::Constants qw( :device_cpu );
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	24	use File::Glob ':bsd_glob';
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	25	use File::Temp qw/tempdir/;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	26	use File::Path qw(remove_tree make_path);
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	27	use File::Basename;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	28	use Mojo::Collection 'c';
				29	use String::Random qw(random_string);
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	30	use IO::File;
				31	use Archive::Tar::Builder;
Akron	da3097e	2017-04-23 19:53:57 +0200	[diff] [blame]	32	use Fcntl qw(:flock SEEK_END);
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	33
				34	# use KorAP::XML::ForkPool;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	35	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	36	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	37
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	38	# TODO: Use KorAP::XML::ForkPool!
				39
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	40	# CHANGES:
				41	# ----------------------------------------------------------
				42	# 2013/11/25
				43	# - Initial release
				44	#
				45	# 2014/10/29
				46	# - Merges foundry data to create indexer friendly documents
				47	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	48	# 2016/02/04
				49	# - renamed to korapxml2krill
				50	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	51	#
				52	# 2016/02/12
				53	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	54	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	55	#
				56	# 2016/02/14
				57	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	58	# - Added support for archive files
				59	#
				60	# 2016/02/15
				61	# - Fixed temporary directory bug
				62	# - Improved skipping before unzipping
				63	# - Added EXPERIMENTAL concurrency support
				64	#
				65	# 2016/02/23
				66	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	67	#
				68	# 2016/02/27
				69	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	70	#
				71	# 2016/03/17
				72	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	73	#
				74	# 2016/03/18
				75	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	76	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	77	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	78	# - Added multi archive support
				79	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	80	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	81	#
				82	# 2016/07/06
				83	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	84	#
				85	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	86	# - Fixed temporary path issue in script
				87	#
				88	# 2016/10/24
				89	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	90	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	91	# 2016/10/24
				92	# - Added support for document extraction
				93	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	94	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	95	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	96	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	97	# 2016/12/21
				98	# - added support for base-sentences and base-tokenizations
				99	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	100	# 2017/01/20
				101	# - added support for DRuKoLa annotations
				102	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	103	# 2017/02/08
				104	# - added support for pagebreak annotations
				105	#
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	106	# 2017/04/06
				107	# - added support for wildcards in input
				108	#
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	109	# 2017/04/07
				110	# - support configuration option
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	111	# - support for temporary extraction
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	112	#
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	113	# 2017/04/12
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	114	# - support serial processing
				115	# - support input root
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	116	# - introduced --sequential-extraction flag
Akron	ce125b6	2017-06-19 11:54:36 +0200	[diff] [blame]	117	#
				118	# 2017/06/19
				119	# - added support for DCK
Akron	3abc03e	2017-06-29 16:23:35 +0200	[diff] [blame]	120	#
				121	# 2017/06/29
				122	# - Fixed exit codes
Akron	9a062ce	2017-07-04 19:12:05 +0200	[diff] [blame]	123	#
				124	# 2017/07/04
				125	# - Fixed tar building process
Akron	4c67919	2018-01-16 17:41:49 +0100	[diff] [blame]	126	#
				127	# 2018/01/16
				128	# - Added LWC support
Akron	5fdc7e1	2018-07-19 12:37:48 +0200	[diff] [blame]	129	#
				130	# 2018/07/19
				131	# - Preliminary support for HNC.
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	132	#
				133	# 2019/01/22
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	134	# - Preliminary support for DGD.
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	135	# - Support for non-word tokens.
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	136	#
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	137	# 2019/02/13
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	138	# - Support for 'koral:field' array.
				139	# - Support for Koral versioning.
Akron	63d03ee	2019-02-13 18:49:38 +0100	[diff] [blame]	140	# - Ignore temporary extract parameter on
				141	# directory archiving.
Akron	7d5e638	2019-08-08 16:36:27 +0200	[diff] [blame]	142	#
				143	# 2019/08/08
				144	# - Support for Talismane.
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	145	#
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	146	# 2019/12/17
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	147	# - Added support for DGD pseudo-sentences
				148	# based on anchor milestones.
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	149	# - Support for non-verbal annotations.
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame^]	150	#
				151	# 2020/04/23
				152	# - Added support for Redewiedergabe-Korpus structure
				153	# annotations, based on sentence and paragraph milestones
				154	# - Added support for Redewiedergabe-Korpus morphology
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	155	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	156
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame^]	157	our $LAST_CHANGE = '2020/04/23';
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	158	our $LOCAL = $FindBin::Bin;
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	159	our $KORAL_VERSION = 0.03;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	160	our $VERSION_MSG = <<"VERSION";
				161	Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
				162	VERSION
				163
# Forward declarations (with prototypes) of helpers that are
# defined further down, so they can be used before their definition
sub get_file_name($);
sub get_file_name_from_glob($);

# Parse command:
# The first argument is treated as the command, if it is
# a true value that does not start with a dash
our @ARGV;
my $cmd;
if ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') {
  $cmd = shift @ARGV;
};

# Keep a copy of the remaining arguments,
# as option parsing will consume them
my @keep_argv = @ARGV;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	175
# Options that may be given multiple times are collected in arrays
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# Each specification is written as 'name|alias' followed by an
# optional type ('=s' string, '=i' integer, '=f' float) or
# a '!' for negatable flags. The separator between name and alias
# has to be a plain pipe - an escaped pipe would become part of
# the option name.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \(my $input_base),
  'output|o=s'               => \(my $output),
  'overwrite|w'              => \(my $overwrite),
  'meta|m=s'                 => \(my $meta),
  'token|t=s'                => \(my $token_base),
  'base-sentences|bs=s'      => \(my $base_sentences),
  'base-paragraphs|bp=s'     => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s'    => \(my $base_pagebreaks),
  'gzip|z'                   => \(my $gzip),
  'temporary-extract|te=s'   => \(my $extract_dir),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \(my $cache_file),
  'config|cfg=s'             => \(my $cfg_file),
  'log|l=s'                  => \(my $log_level),
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \(my $primary),
  'pretty|y'                 => \(my $pretty),
  'jobs|j=i'                 => \(my $jobs),
  'koral|k=f'                => \(my $koral),
  'to-tar'                   => \(my $to_tar),
  'non-word-tokens|nwt'      => \(my $non_word_tokens),
  'non-verbal-tokens|nvt'    => \(my $non_verbal_tokens),
  'sequential-extraction|se' => \(my $sequential_extraction),
  'cache-size|cs=s'          => \(my $cache_size),
  'cache-delete|cd!'         => \(my $cache_delete),
  'cache-init|ci!'           => \(my $cache_init),

  # Print the selected documentation sections
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message with minimal usage information
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
				225
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	226
# Load from configuration.
# Values passed on the command line always take precedence -
# the configuration file only fills options that are still unset.
# A configuration file that is given but does not exist is ignored.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Options with a single value:
  # configuration key and the variable to fill
  my @scalar_options = (
    ['overwrite'             => \$overwrite],
    ['gzip'                  => \$gzip],
    ['jobs'                  => \$jobs],
    ['koral'                 => \$koral],              # Koral version
    ['input-base'            => \$input_base],         # Input root base directory
    ['temporary-extract'     => \$extract_dir],
    ['token'                 => \$token_base],         # Token base
    ['non-word-tokens'       => \$non_word_tokens],    # Non-word tokenization
    ['non-verbal-tokens'     => \$non_verbal_tokens],  # Non-verbal tokenization
    ['cache'                 => \$cache_file],
    ['cache-size'            => \$cache_size],
    ['cache-delete'          => \$cache_delete],
    ['cache-init'            => \$cache_init],
    ['sequential-extraction' => \$sequential_extraction],
    ['meta'                  => \$meta],
    ['output'                => \$output],
    ['base-sentences'        => \$base_sentences],
    ['base-paragraphs'       => \$base_paragraphs],
    ['base-pagebreaks'       => \$base_pagebreaks],
    ['to-tar'                => \$to_tar],             # Write to tar
    ['log'                   => \$log_level]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # Command line wins
    next if defined $$target;
    next unless defined $config{$key};

    $$target = $config{$key};
  };

  # Options with multiple values, separated by semicolons
  # with any amount of surrounding whitespace
  my @list_options = (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # Command line wins
    next if scalar(@$target);
    next unless defined $config{$key};

    @$target = split /\s*;\s*/, $config{$key};
  };
};
				353
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	354
# Apply the built-in defaults to all options that were
# set neither on the command line nor in the configuration
foreach (
  [\$token_base,            'OpenNLP#tokens'],
  [\$cache_file,            'korapxml2krill.cache'],
  [\$cache_size,            '50m'],
  [\$jobs,                  0],
  [\$koral,                 $KORAL_VERSION],
  [\$cache_delete,          1],
  [\$cache_init,            1],
  [\$sequential_extraction, 0],
  [\$log_level,             'ERROR'],
  [\$base_sentences,        ''],
  [\$base_paragraphs,       ''],
  [\$base_pagebreaks,       ''],
  [\$non_word_tokens,       0],
  [\$non_verbal_tokens,     0]
) {
  my ($option, $default) = @$_;
  $$option //= $default;
};

# The base annotations are compared in lower case later on
$_ = lc($_) foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);


# Initialize log4perl object, writing colored messages
# of the requested level to STDERR
Log::Log4perl->init({
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.rootLogger' => uc($log_level) . ', STDERR'
});

my $log = Log::Log4perl->get_logger('main');


# Report the configuration file whenever one was passed
if ($cfg_file) {
  print "Reading config from $cfg_file\n";
};
				388
				389
# Parameters for pod2usage on invalid calls:
# print the listed documentation sections and exit with status 1.
# The section names are alternatives of a pattern and therefore
# have to be separated by plain pipes.
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;


# A job count of -1 requests an automatic setting:
# five jobs per CPU as reported by Sys::Info
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
				410
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	411
# Start serial processing:
# Each input is processed by a separate 'archive' run of this
# script, that receives all passed options except for the input
# and output parameters, which are set per run.
if ($cmd && $cmd eq 'serial') {

  # Unless written to tar, a given output has to be an existing directory
  if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };

  # Remove all inputs and outputs from the arguments to pass on
  my @passed_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag - the value follows as the next argument
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
      next;
    }

    # Input or output value
    elsif ($remove_next) {
      $remove_next = 0;
      next;
    }

    # Flag and value are joined by '=', as in '--input=...'.
    # Passing these on would make every run process all inputs.
    elsif ($arg =~ /^(?:-i|--input|-o|--output)=/) {
      next;
    };

    # Pass parameter
    push @passed_argv, $arg;
  };
  @keep_argv = @passed_argv;


  # Iterate over all inputs
  foreach my $pattern (@input) {

    # This will create a directory
    my $new_out = catdir($output, get_file_name_from_glob($pattern));

    # Create new path, in case the output is not meant to be tarred
    unless ($to_tar) {
      if (make_path($new_out) == 0 && !-d $new_out) {
        $log->error("Can\'t create path $new_out");
        exit 1;
      };
    };

    # Create archive command - passed as a list, so no shell is involved
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $pattern, '-o', $new_out
    );
    print "Start serial processing of $pattern to $new_out\n";

    # Start archiving and report failing runs,
    # but continue with the remaining inputs
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $pattern failed (status $?)");
    };
  };

  exit;
};
				466
# Lookup of everything to skip, keyed in lower case
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# All supported annotations as [Foundry, Layer] pairs,
# optionally followed by a third element passed along for that layer
my @layers;

# Base annotations are only added, if no other base is requested
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

push(
  @layers,

  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency'],

  # CMC
  ['CMC', 'Morpho']
);

# DeReKo - collect the base annotations taken from the structure layer
my @dereko_attr = ();
push(@dereko_attr, 'sentences')  if $base_sentences  eq 'dereko#structure';
push(@dereko_attr, 'paragraphs') if $base_paragraphs eq 'dereko#structure';
push(@dereko_attr, 'pagebreaks') if $base_pagebreaks eq 'dereko#structure';

if (@dereko_attr) {
  push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
}
else {
  push(@layers, ['DeReKo', 'Structure']);
};

# DGD
push(@layers, ['DGD', 'Morpho']);
if ($base_sentences eq 'dgd#structure') {
  push(@layers, ['DGD', 'Structure', 'base-sentence']);
};

push(
  @layers,

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Glemm
  ['Glemm', 'Morpho'],

  # HNC
  ['HNC', 'Morpho'],

  # LWC
  ['LWC', 'Dependency'],

  # Malt
  ['Malt', 'Dependency'],

  # Marmot
  ['MarMoT', 'Morpho'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Redewiedergabe
  ['RWK', 'Morpho']
);

if ($base_sentences eq 'rwk#structure') {
  push(@layers, ['RWK', 'Structure']);
};

push(
  @layers,

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # Talismane
  ['Talismane', 'Dependency'],
  ['Talismane', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency']
);


# Check filters
my @filtered_anno;

# Everything is skipped - only use the explicitly requested annotations
if ($skip{'#all'}) {
  foreach (@anno) {
    push @filtered_anno, [ split('#', $_) ];
  };
}

# Add all annotations that are not skipped
else {
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};
				587
				588	# Get tokenization basis
# Split the token base of the form 'Foundry#Layer'.
# The variables are declared unconditionally, as the behaviour
# of 'my ... if ...' is undefined in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# Remove file extension - there is no layer,
# if the token base contains no '#'
$token_base_layer =~ s/\.xml$//i if defined $token_base_layer;

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object, receiving the parsed options,
# the cache, the logger and the filtered annotations
my $batch_file = KorAP::XML::Batch::File->new(
  cache             => $cache,
  meta_type         => $meta,
  overwrite         => $overwrite,
  foundry           => $token_base_foundry,
  layer             => $token_base_layer,
  gzip              => $gzip,
  log               => $log,
  koral             => $koral,
  primary           => $primary,
  pretty            => $pretty,
  anno              => \@filtered_anno,
  non_word_tokens   => $non_word_tokens,
  non_verbal_tokens => $non_verbal_tokens
);
				617
# Get file name based on path information.
#
# Expects the path of a document and returns a flat name for it:
# A leading temporary directory and the first occurrence of the
# input base are removed and the remaining path segments are
# joined by dashes.
# The input base is the first element of @input - in case
# this is a directory, without its last path segment.
sub get_file_name ($) {
  my $file = shift;

  my $base = $input[0] // '';
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input base, optionally preceded by a slash.
  # The base is a path and no pattern, so it has to be quoted -
  # otherwise characters like '(' or '[' in the path would be
  # treated as regular expression syntax.
  $file =~ s/\/?\Q$base\E//;

  # Join the path segments by dashes
  $file =~ tr/\//-/;
  $file =~ s{^-+}{};

  # Drop the first dash separated part,
  # if at least three parts remain afterwards
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
				634

# Turn an input pattern (a path that may contain wildcards)
# into a flat name: path separators and brackets become dashes,
# wildcards are dropped, runs of dashes are collapsed, a single
# dash is trimmed from both ends and finally a trailing '.zip'
# is removed.
sub get_file_name_from_glob ($) {
  my ($name) = @_;

  for ($name) {
    tr{\\/}{-};    # Transform paths
    tr{*?}{}d;     # Remove arbitrary fills
    tr/{}[]/-/;    # Remove class and multiple brackets
    s/-{2,}/-/g;   # Remove sequences of binding characters
    s/^-//;        # Clean beginning
    s/-$//;        # Clean end
    s/\.zip$//;    # Remove file extension
  };

  return $name;
};
				647
				648
# Convert sigle to path construct, so 'CORPUS_DOC.TEXT' becomes
# 'CORPUS/DOC/TEXT'. Surrounding whitespace is optional and removed.
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# With a command given, the output has to be an
# existing directory, unless it is written to tar
if ($cmd) {
  if ($output && (!defined($to_tar)) && !-d $output) {
    $log->error("Directory '$output' does not exist.");
    exit 1;
  };
};
				658
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	659
				660	# Glob and prefix files
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	661	if (@input) {
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	662
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	663	my @new_input = ();
				664
				665	# Iterate over all inputs
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	666	foreach my $wild_card (@input) {
				667
				668	# Prefix with input root
				669	$wild_card = $input_base ? catfile($input_base, $wild_card) : $wild_card;
				670
				671	push (@new_input, bsd_glob($wild_card));
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	672	};
				673
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	674	# Sort files by length
				675	@input = sort { length($a) <=> length($b) } @new_input;
				676
				677	print 'Input is ' . join(', ', @input)."\n";
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	678	};
				679
				680
# Process a single file

# Take two timestamps at compile time: one for the overall
# runtime and one as the first lap marker for stop_time()
BEGIN {
  $main::TIME = Benchmark->new;
  $main::LAST_STOP = Benchmark->new;
};

# Log the time passed since the last lap marker together with
# the overall time, then move the lap marker to now
sub stop_time {
  my $now     = Benchmark->new;
  my $lap     = timestr(timediff($now, $main::LAST_STOP));
  my $overall = timestr(timediff($now, $main::TIME));

  $log->info(join('', 'The code took: ', $lap, ' (overall: ', $overall, ')'));

  $main::LAST_STOP = $now;
};

# Without a command, the first input is converted and the script ends
unless ($cmd) {

  # Make sure the document path ends with a slash
  (my $input = $input[0]) =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();
  exit;
};
				712
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	713
# Extract XML files
if ($cmd eq 'extract') {

  # Output is required
  $output or pod2usage(%ERROR_HASH);

  # The first input has to be a file that can be opened as an archive
  my $archive = -f($input[0]) && KorAP::XML::Archive->new($input[0]);

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit 1;
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    $log->error("Unzip is not installed or incompatible.");
    exit 1;
  };

  # Attach all further inputs as annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Will set @sigle
  set_sigle($archive);

  # Iterate over all given sigles and extract
  # TODO: Make this OS independent
  # TODO: Clarify whether the prefix has to be respected here
  foreach (@sigle) {
    print "$_ ...\n";

    my $extracted = $archive->extract_sigle([$_], $output, $jobs);

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };
}
				758
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	759
# Process an archive
#
# Converts every text found in the first input, which is either a
# directory or a zip archive (further inputs are attached to the
# archive as annotation archives). With $extract_dir defined and a
# non-directory input, the archive is extracted completely first and
# the extraction directory becomes the input. The conversion of each
# text is handed to a Parallel::ForkManager pool; with $to_tar set,
# the finished files are collected in a tar file.
elsif ($cmd eq 'archive') {

  # First extract, then archive
  if (defined $extract_dir && !-d $input[0]) {

    # Create new archive object
    if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

      # Check zip capabilities
      unless ($archive->test_unzip) {
        $log->error("Unzip is not installed or incompatible.");
        exit 1;
      };

      # Add further annotation archived
      $archive->attach($_) foreach @input[1..$#input];

      # Create a temporary directory
      if ($extract_dir eq ':temp:') {
        $extract_dir = tempdir(CLEANUP => 0);
        print "Temporarily extract to $extract_dir\n";
      };

      # Add some random extra to avoid clashes with multiple archives
      $extract_dir = catdir($extract_dir, random_string('cccccc'));

      # Extract to temporary directory
      if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
        @input = ($extract_dir);
      }
      else {
        $log->error('Unable to extract from primary archive ' . $input[0] .
                      ' to ' . $extract_dir);
        exit 1;
      };
    }

    # Can't create archive object
    else {
      $log->error('Unable to extract from primary archive ' . $input[0]);
      exit 1;
    };
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  my $tar_archive;
  my $output_dir = $output;
  my $tar_fh;

  # Initialize tar archive
  if ($to_tar) {
    $tar_archive = Archive::Tar::Builder->new(
      ignore_errors => 1
    );

    # Set output name
    my $tar_file = $output;
    $tar_file .= '.tar' unless $tar_file =~ /\.tar$/;

    # Initiate the tar file
    print "Writing to file $tar_file\n";
    $tar_fh = IO::File->new($tar_file, 'w');

    # Without a file handle there is nothing to write the archive to
    unless ($tar_fh) {
      $log->error("Unable to open tar file '$tar_file' for writing: $!");
      exit 1;
    };
    $tar_fh->binmode(1);

    # Set handle
    $tar_archive->set_handle($tar_fh);

    # Output to temporary directory
    $output_dir = File::Temp->newdir;
  };

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data structure passed to finish() is the last argument
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # On success, move the written file into the tar archive
      if (!$code && $to_tar && $data->[2]) {
        my $filename = $data->[2];

        # Lock filehandle
        if (flock($tar_fh, LOCK_EX)) {

          my $clean_file = fileparse($filename);

          # Archive and remove file
          $tar_archive->archive_as($filename => $clean_file);
          unlink $filename;

          # Unlock filehandle
          flock($tar_fh, LOCK_UN);
        }
        else {
          $log->warn("Unable to add $filename to archive");
        };
      };

      # Release the temporary directory object (if any)
      $data->[1] = undef if $data->[1];
    }
  );

  my $t;    # Start of processing, stays undefined without valid input
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };


  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect every directory containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

    DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output_dir,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          [
            "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
            undef,
            $filename
          ]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    unless ($archive->test_unzip) {
      $log->error("Unzip is not installed or incompatible.");
      exit 1;
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # Get sigles to extract
    set_sigle($archive);

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

    ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information (the leading prefix is not needed here)
      my (undef, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output_dir,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horrible slow!

      # Extract from archive
      if ($archive->extract_sigle([join('/', $corpus, $doc, $text)], $temp, $sequential_extraction ? 1 : $jobs)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file
        if (my $return = $batch_file->process($dir => $filename)) {

          # Delete temporary file
          $pool->finish(
            0,
            [
              "Processed " . $filename . ($return == -1 ? " - already existing" : ''),
              $temp,
              $filename
            ]
          );
        }
        else {
          # Delete temporary file
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  # Close tar filehandle
  if ($to_tar && $tar_fh) {
    $tar_archive->finish;
    $tar_fh->close;
    print "Wrote to tar archive.\n";
  };

  # Processing only started (and $t was only set) for valid input
  print timestr(timediff(Benchmark->new, $t))."\n" if defined $t;
  print "Done.\n";
};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1033
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1034
# For an archive, this will create the list
# of all sigles to process.
#
# Without any sigles given, @sigle is filled with the sigles of all
# texts listed by the archive. Otherwise doc sigles (two path
# segments) are extracted to $output right away and only the
# remaining text sigles are kept in @sigle.
#
# Parameter: the archive object.
# Returns:   the prefix value, either taken from the last split path,
#            from the archive's prefix check, or the initial 1.
sub set_sigle {
  my $archive = shift;

  my $prefix = 1;

  # No sigles given
  unless (@sigle) {

    # Get files
    foreach my $path ($archive->list_texts) {

      # Split path information
      ($prefix, my @parts) = $archive->split_path($path);

      # TODO: Make this OS independent
      push @sigle, join '/', @parts[0 .. 2];
    };

    return $prefix;
  };

  # Check sigle for doc sigles
  my @text_sigle;
  my $prefix_checked = 0;

  # Ask the archive for a prefix, but only on the first call
  my $check_prefix_once = sub {
    return if $prefix_checked;

    $prefix = $archive->check_prefix;
    print " with prefix ..." if $prefix;

    $prefix_checked = 1;
  };

  # Iterate over all sigle
  foreach (@sigle) {

    # Sigle is a text sigle
    unless ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
      push @text_sigle, $_;
      $check_prefix_once->();
      next;
    };

    # Sigle is a doc sigle
    print "$_ ...";
    $check_prefix_once->();
    print "\n";

    my $extracted = $archive->extract_sigle(
      [$_], $output, $sequential_extraction ? 1 : $jobs
    );

    print '... ' . ($extracted ? '' : 'not ');
    print "extracted.\n";
  };

  @sigle = @text_sigle;

  return $prefix;
};
				1108
				1109
				1110
# Cleanup temporary extraction directory (if one was used)
CLEANUP: {
  last CLEANUP unless $extract_dir;

  # remove_tree reports the number of removed objects
  my $objects = remove_tree($extract_dir, { safe => 1 });
  print "Removed directory $extract_dir with $objects objects.\n";
};


print "\n";
				1119
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1120	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1121
				1122	=pod
				1123
				1124	=encoding utf8
				1125
				1126	=head1 NAME
				1127
Akron	42f48c1	2020-02-14 13:08:13 +0100	[diff] [blame]	1128	korapxml2krill - Merge KorAP-XML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1129
				1130
				1131	=head1 SYNOPSIS
				1132
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1133	korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1134
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1135
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1136	=head1 DESCRIPTION
				1137
				1138	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				1139	compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1140	The C<korapxml2krill> command line tool is a simple wrapper of this library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1141
				1142
				1143	=head1 INSTALLATION
				1144
				1145	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
				1146
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	1147	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1148
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1149	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1150	be available on your command line immediately.
Akron	6eff23b	2018-09-24 10:31:20 +0200	[diff] [blame]	1151	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.16.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1152	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1153
				1154	=head1 ARGUMENTS
				1155
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1156	$ korapxml2krill -z --input <directory> --output <filename>
				1157
				1158	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1159	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1160
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1161	=over 2
				1162
				1163	=item B<archive>
				1164
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	1165	$ korapxml2krill archive -z --input <directory|archive> --output <directory|tar>
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1166
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1167	Converts an archive of KorAP-XML documents. It expects a directory
				1168	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1169
				1170	=item B<extract>
				1171
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1172	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				1173
				1174	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1175
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1176	=item B<serial>
				1177
				1178	$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
				1179
				1180	Convert archives sequentially. The inputs are not merged but treated
				1181	as they are (so they may be premerged or globs).
				1182	The C<--output> directory is treated as the base directory where subdirectories
Akron	081639e	2017-04-21 19:01:39 +0200	[diff] [blame]	1183	are created based on the archive name. In case the C<--to-tar> flag is given,
				1184	the output will be a tar file.
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1185
				1186
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1187	=back
				1188
				1189
				1190	=head1 OPTIONS
				1191
				1192	=over 2
				1193
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1194	=item B<--input|-i> <directory|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1195
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1196	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1197
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1198	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	1199	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				1200	file to batch process multiple files.
				1201	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1202
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1203	C<archive> supports multiple input zip files with the constraint,
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1204	that the first archive listed contains all primary data files
				1205	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1206
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1207	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1208
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1209	Input may also be defined using BSD glob wildcards.
				1210
				1211	-i 'file/news*.zip'
				1212
				1213	The extended input array will be sorted in length order, so the shortest
				1214	path needs to contain all primary data files and all meta data files.
				1215
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	1216	(The directory structure follows the base directory format,
				1217	that may include a C<.> root folder.
				1218	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1219	need to be passed with a hash sign in front of the archive's name.
				1220	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1221
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1222	To support zip files, a version of C<unzip> needs to be installed that is
				1223	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1224
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1225	B<The root folder switch using the hash sign is experimental and
				1226	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	1227
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1228
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1229	=item B<--input-base|-ib> <directory>
				1230
				1231	The base directory for inputs.
				1232
				1233
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1234	=item B<--output|-o> <directory|file>
				1235
				1236	Output folder for archive processing or
				1237	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1238	writes to C<STDOUT> by default
				1239	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1240
				1241	=item B<--overwrite|-w>
				1242
				1243	Overwrite files that already exist.
				1244
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1245
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1246	=item B<--token|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1247
				1248	Define the default tokenization by specifying
				1249	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1250	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1251	This will directly take the file instead of running
				1252	the layer implementation!
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1253
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1254
				1255	=item B<--base-sentences|-bs> <foundry>#<layer>
				1256
				1257	Define the layer for base sentences.
				1258	If given, this will be used instead of using C<Base#Sentences>.
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	1259	Currently C<DeReKo#Structure> and C<DGD#Structure> are the only additional
				1260	layers supported.
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1261
				1262	Defaults to unset.
				1263
				1264
				1265	=item B<--base-paragraphs|-bp> <foundry>#<layer>
				1266
				1267	Define the layer for base paragraphs.
				1268	If given, this will be used instead of using C<Base#Paragraphs>.
				1269	Currently C<DeReKo#Structure> is the only additional layer supported.
				1270
				1271	Defaults to unset.
				1272
				1273
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	1274	=item B<--base-pagebreaks|-bpb> <foundry>#<layer>
				1275
				1276	Define the layer for base pagebreaks.
				1277	Currently C<DeReKo#Structure> is the only layer supported.
				1278
				1279	Defaults to unset.
				1280
				1281
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1282	=item B<--skip|-s> <foundry>[#<layer>]
				1283
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1284	Skip specific annotations by specifying the foundry
				1285	(and optionally the layer with a C<#>-prefix),
				1286	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1287	Can be set multiple times.
				1288
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1289
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1290	=item B<--anno|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1291
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1292	Convert specific annotations by specifying the foundry
				1293	(and optionally the layer with a C<#>-prefix),
				1294	e.g. C<Mate> or C<Mate#Morpho>.
				1295	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1296
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1297
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1298	=item B<--primary|-p>
				1299
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1300	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1301	Can be flagged using C<--no-primary> as well.
				1302	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1303
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1304
Akron	ed9baf0	2019-01-22 17:03:25 +0100	[diff] [blame]	1305	=item B<--non-word-tokens|-nwt>
				1306
				1307	Tokenize non-word tokens like word tokens (defined as matching
				1308	C</[\d\w]/>). Useful to treat punctuations as tokens.
				1309
				1310	Defaults to unset.
				1311
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1312
				1313	=item B<--non-verbal-tokens\|-nvt>
				1314
				1315	Tokenize non-verbal tokens that are marked in the primary data by
				1316	the Unicode symbol 'Black Vertical Rectangle' aka \x25ae.
				1317
				1318	Defaults to unset.
				1319
				1320
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1321	=item B<--jobs\|-j>
				1322
				1323	Define the number of concurrent jobs in separate forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1324	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1325	Defaults to C<0> (everything runs in a single process).
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1326
				1327	If C<sequential-extraction> is not set to true, this will
				1328	also apply to extraction.
				1329
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	1330	Pass -1, and the value will be set automatically to 5
				1331	times the number of available cores.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1332	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1333
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1334
Akron	263274c	2019-02-07 09:48:30 +0100	[diff] [blame]	1335	=item B<--koral\|-k>
				1336
				1337	Version of the output format. Supported versions are:
				1338	C<0> for legacy serialization, C<0.03> for serialization
				1339	with metadata fields as key-values on the root object,
				1340	C<0.4> for serialization with metadata fields as a list
				1341	of C<"@type":"koral:field"> objects.
				1342
				1343	Currently defaults to C<0.03>.
				1344
				1345
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1346	=item B<--sequential-extraction\|-se>
				1347
				1348	Flag to indicate, if the C<jobs> value also applies to extraction.
				1349	Some systems may have problems with extracting multiple archives
				1350	to the same folder at the same time.
				1351	Can be flagged using C<--no-sequential-extraction> as well.
				1352	Defaults to C<false>.
				1353
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1354
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1355	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1356
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1357	Define the metadata parser to use. Defaults to C<I5>.
				1358	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				1359	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1360
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1361
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1362	=item B<--pretty\|-y>
				1363
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1364	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1365	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1366
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1367
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1368	=item B<--gzip\|-z>
				1369
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1370	Compress the output.
				1371	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1372
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1373
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1374	=item B<--cache\|-c>
				1375
				1376	File to mmap a cache (using L<Cache::FastMmap>).
				1377	Defaults to C<korapxml2krill.cache> in the calling directory.
				1378
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1379
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1380	=item B<--cache-size\|-cs>
				1381
				1382	Size of the cache. Defaults to C<50m>.
				1383
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1384
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1385	=item B<--cache-init\|-ci>
				1386
				1387	Initialize cache file.
				1388	Can be flagged using C<--no-cache-init> as well.
				1389	Defaults to C<true>.
				1390
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1391
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1392	=item B<--cache-delete\|-cd>
				1393
				1394	Delete cache file after processing.
				1395	Can be flagged using C<--no-cache-delete> as well.
				1396	Defaults to C<true>.
				1397
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1398
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1399	=item B<--config\|-cfg>
				1400
				1401	Configure the parameters of your call in a file
				1402	of key-value pairs with whitespace separator:
				1403
				1404	overwrite 1
				1405	token DeReKo#Structure
				1406	...
				1407
				1408	Supported parameters are:
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1409	C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1410	C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron	31a08cb	2019-02-20 20:43:26 +0100	[diff] [blame]	1411	C<output>, C<koral>,
				1412	C<temporary-extract>, C<sequential-extraction>,
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame]	1413	C<base-sentences>, C<base-paragraphs>,
				1414	C<base-pagebreaks>,
				1415	C<skip> (semicolon separated), C<sigle>
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1416	(semicolon separated), C<anno> (semicolon separated).
				1417
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1418	Configuration parameters will always be overwritten by
				1419	passed parameters.
				1420
				1421
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1422	=item B<--temporary-extract\|-te>
				1423
				1424	Only valid for the C<archive> command.
				1425
				1426	This will first extract all files into a
				1427	directory and then will archive.
				1428	If the directory is given as C<:temp:>,
				1429	a temporary directory is used.
				1430	This is especially useful to avoid
				1431	massive unzipping and potential
				1432	network latency.
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1433
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1434
Akron	c93a080	2019-07-11 15:48:34 +0200	[diff] [blame]	1435	=item B<--to-tar>
				1436
				1437	Only valid for the C<archive> command.
				1438
				1439	Writes the output into a tar archive.
				1440
				1441
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1442	=item B<--sigle\|-sg>
				1443
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1444	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1445	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1446	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	1447	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1448	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1449	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1450
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1451
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1452	=item B<--log\|-l>
				1453
				1454	The L<Log::Log4perl> log level, defaults to C<ERROR>.
				1455
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1456
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1457	=item B<--help\|-h>
				1458
Akron	42f48c1	2020-02-14 13:08:13 +0100	[diff] [blame]	1459	Print help information.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1460
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1461
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1462	=item B<--version\|-v>
				1463
				1464	Print version information.
				1465
				1466	=back
				1467
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1468
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1469	=head1 ANNOTATION SUPPORT
				1470
				1471	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				1472	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				1473	The base foundry with paragraphs, sentences, and the text element is mandatory for
				1474	L<Krill\|https://github.com/KorAP/Krill>.
				1475
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1476	Base
				1477	#Paragraphs
				1478	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1479
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1480	Connexor
				1481	#Morpho
				1482	#Phrase
				1483	#Sentences
				1484	#Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1485
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1486	CoreNLP
				1487	#Constituency
				1488	#Morpho
				1489	#NamedEntities
				1490	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1491
Akron	ce125b6	2017-06-19 11:54:36 +0200	[diff] [blame]	1492	CMC
				1493	#Morpho
				1494
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1495	DeReKo
				1496	#Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1497
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	1498	DGD
				1499	#Morpho
Akron	c29b8e1	2019-12-16 14:28:09 +0100	[diff] [blame]	1500	#Structure
Akron	57510c1	2019-01-04 14:58:53 +0100	[diff] [blame]	1501
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1502	DRuKoLa
				1503	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1504
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1505	Glemm
				1506	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1507
Akron	ea1aed5	2018-07-19 14:43:34 +0200	[diff] [blame]	1508	HNC
				1509	#Morpho
				1510
Akron	4c67919	2018-01-16 17:41:49 +0100	[diff] [blame]	1511	LWC
				1512	#Dependency
				1513
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1514	Malt
				1515	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1516
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1517	MarMoT
				1518	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1519
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1520	Mate
				1521	#Dependency
				1522	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1523
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1524	MDParser
				1525	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1526
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1527	OpenNLP
				1528	#Morpho
				1529	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1530
Akron	07e2477	2020-04-23 14:00:54 +0200	[diff] [blame^]	1531	RWK
				1532	#Morpho
				1533	#Structure
				1534
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1535	Sgbr
				1536	#Lemma
				1537	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1538
Akron	7d5e638	2019-08-08 16:36:27 +0200	[diff] [blame]	1539	Talismane
				1540	#Dependency
				1541	#Morpho
				1542
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1543	TreeTagger
				1544	#Morpho
				1545	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1546
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1547	XIP
				1548	#Constituency
				1549	#Morpho
				1550	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1551
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1552
				1553	More importers are in preparation.
				1554	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				1555	See the built-in annotation importers as examples.
				1556
Akron	f73ffb6	2018-06-27 12:13:59 +0200	[diff] [blame]	1557
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1558	=head1 About KorAP-XML
				1559
				1560	KorAP-XML (Bański et al. 2012) is an implementation of the KorAP
				1561	data model (Bański et al. 2013), where text data are stored physically
				1562	separated from their interpretations (i.e. annotations).
				1563	A text document in KorAP-XML therefore consists of several files
				1564	containing primary data, metadata and annotations.
				1565
				1566	The structure of a single KorAP-XML document can be as follows:
				1567
				1568	- data.xml
				1569	- header.xml
				1570	+ base
				1571	- tokens.xml
				1572	- ...
				1573	+ struct
				1574	- structure.xml
				1575	- ...
				1576	+ corenlp
				1577	- morpho.xml
				1578	- constituency.xml
				1579	- ...
				1580	+ tree_tagger
				1581	- morpho.xml
				1582	- ...
				1583	- ...
				1584
				1585	The C<data.xml> contains the primary data, the C<header.xml> contains
				1586	the metadata, and the annotation layers are stored in subfolders
				1587	like C<base>, C<struct> or C<corenlp>
				1588	(so-called "foundries"; Bański et al. 2013).
				1589
				1590	Metadata is available in the TEI-P5 variant I5
Akron	d4c5c10	2020-02-11 11:47:59 +0100	[diff] [blame]	1591	(Lüngen and Sperberg-McQueen 2012). See the documentation in
				1592	L<KorAP::XML::Meta::I5> for translatable fields.
				1593
				1594	Annotations correspond to a variant of the TEI-P5 feature structures
				1595	(TEI Consortium; Lee et al. 2004).
Akron	72bc522	2020-02-06 16:00:13 +0100	[diff] [blame]	1596	Annotation feature structures refer to character sequences of the primary text
				1597	inside the C<text> element of the C<data.xml>.
				1598	A single annotation containing the lemma of a token can have the following structure:
				1599
				1600	<span from="0" to="3">
				1601	<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
				1602	<f name="lex">
				1603	<fs>
				1604	<f name="lemma">zum</f>
				1605	</fs>
				1606	</f>
				1607	</fs>
				1608	</span>
				1609
				1610	The C<from> and C<to> attributes refer to the character span
				1611	in the primary text.
				1612	Depending on the kind of annotation (e.g. token-based, span-based, relation-based),
				1613	the structure may vary. See C<KorAP::XML::Annotation::*> for various
				1614	annotation preprocessors.
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1615
				1616	Multiple KorAP-XML documents are organized on three levels following
				1617	the "IDS Textmodell" (Lüngen and Sperberg-McQueen 2012):
				1618	corpus E<gt> document E<gt> text. On each level metadata information
				1619	can be stored, which C<korapxml2krill> will merge into a single metadata
				1620	object per text. A corpus is therefore structured as follows:
				1621
				1622	+ <corpus>
				1623	- header.xml
				1624	+ <document>
				1625	- header.xml
				1626	+ <text>
				1627	- data.xml
				1628	- header.xml
				1629	- ...
				1630	- ...
				1631
				1632	A single text can be identified by the concatenation of
				1633	the corpus identifier, the document identifier and the text identifier.
				1634	This identifier is called the text sigle
				1635	(e.g. a text with the identifier C<18486> in the document C<060> in the
				1636	corpus C<WPD17> has the text sigle C<WPD17/060/18486>, see C<--sigle>).
				1637
				1638	These corpora are often stored in zip files, which C<korapxml2krill>
				1639	can deal with. Corpora may also be split in multiple zip archives
				1640	(e.g. one zip file per foundry), which is also supported (see C<--input>).
				1641
				1642	Examples for KorAP-XML files are included in L<KorAP::XML::Krill>
				1643	in the form of a test suite.
				1644	The resulting JSON format merges all annotation layers
				1645	based on a single token stream.
				1646
				1647	=head2 References
				1648
				1649	Piotr Bański, Cyril Belica, Helge Krause, Marc Kupietz, Carsten Schnober, Oliver Schonefeld, and Andreas Witt (2011):
				1650	KorAP data model: first approximation, December.
				1651
				1652	Piotr Bański, Peter M. Fischer, Elena Frick, Erik Ketzan, Marc Kupietz, Carsten Schnober, Oliver Schonefeld and Andreas Witt (2012):
				1653	"The New IDS Corpus Analysis Platform: Challenges and Prospects",
				1654	Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC 2012).
				1655	L<PDF\|http://www.lrec-conf.org/proceedings/lrec2012/pdf/789_Paper.pdf>
				1656
				1657	Piotr Bański, Elena Frick, Michael Hanl, Marc Kupietz, Carsten Schnober and Andreas Witt (2013):
				1658	"Robust corpus architecture: a new look at virtual collections and data access",
				1659	Corpus Linguistics 2013. Abstract Book. Lancaster: UCREL, pp. 23-25.
				1660	L<PDF\|https://ids-pub.bsz-bw.de/frontdoor/deliver/index/docId/4485/file/Ba%c5%84ski_Frick_Hanl_Robust_corpus_architecture_2013.pdf>
				1661
				1662	Kiyong Lee, Lou Burnard, Laurent Romary, Eric de la Clergerie, Thierry Declerck,
				1663	Syd Bauman, Harry Bunt, Lionel Clément, Tomaz Erjavec, Azim Roussanaly and Claude Roux (2004):
				1664	"Towards an international standard on feature structure representation",
				1665	Proceedings of the fourth International Conference on Language Resources and Evaluation (LREC 2004),
				1666	pp. 373-376.
				1667	L<PDF\|http://www.lrec-conf.org/proceedings/lrec2004/pdf/687.pdf>
				1668
				1669	Harald Lüngen and C. M. Sperberg-McQueen (2012):
				1670	"A TEI P5 Document Grammar for the IDS Text Model",
				1671	Journal of the Text Encoding Initiative, Issue 3 \| November 2012.
				1672	L<PDF\|https://journals.openedition.org/jtei/pdf/508>
				1673
				1674	TEI Consortium, eds:
				1675	"Feature Structures",
				1676	Guidelines for Electronic Text Encoding and Interchange.
				1677	L<html\|https://www.tei-c.org/release/doc/tei-p5-doc/en/html/FS.html>
				1678
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1679	=head1 AVAILABILITY
				1680
				1681	https://github.com/KorAP/KorAP-XML-Krill
				1682
				1683
				1684	=head1 COPYRIGHT AND LICENSE
				1685
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1686	Copyright (C) 2015-2020, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1687
Akron	8f69d63	2020-01-15 16:58:11 +0100	[diff] [blame]	1688	Author: L<Nils Diewald\|https://nils-diewald.de/>
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1689
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1690	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1691
				1692	L<KorAP::XML::Krill> is developed as part of the L<KorAP\|http://korap.ids-mannheim.de/>
				1693	Corpus Analysis Platform at the
Akron	94262ce	2019-02-28 21:42:43 +0100	[diff] [blame]	1694	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1695	member of the
Akron	f1849aa	2019-12-16 23:35:33 +0100	[diff] [blame]	1696	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1697
				1698	This program is free software published under the
				1699	L<BSD-2 License\|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				1700
				1701	=cut