Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: 1b994c239d91c34badd39d1111af192f03d9b331 [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	10	use POSIX qw/ceil/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	11	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	12	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	13	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	14	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	15	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	16	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	18	use KorAP::XML::Batch::File;
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	19	use Config::Simple;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	20	use Parallel::ForkManager;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	21	use v5.10;
				22	use Sys::Info;
				23	use Sys::Info::Constants qw( :device_cpu );
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	24	use File::Glob ':bsd_glob';
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	25	use File::Temp qw/tempdir/;
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	26	use File::Path qw(remove_tree make_path);
				27	use Mojo::Collection 'c';
				28	use String::Random qw(random_string);
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	29
				30	# use KorAP::XML::ForkPool;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	31	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	32	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	33
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	34	# TODO: Use KorAP::XML::ForkPool!
				35
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	36	# CHANGES:
				37	# ----------------------------------------------------------
				38	# 2013/11/25
				39	# - Initial release
				40	#
				41	# 2014/10/29
				42	# - Merges foundry data to create indexer friendly documents
				43	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	44	# 2016/02/04
				45	# - renamed to korapxml2krill
				46	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	47	#
				48	# 2016/02/12
				49	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	50	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	51	#
				52	# 2016/02/14
				53	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	54	# - Added support for archive files
				55	#
				56	# 2016/02/15
				57	# - Fixed temporary directory bug
				58	# - Improved skipping before unzipping
				59	# - Added EXPERIMENTAL concurrency support
				60	#
				61	# 2016/02/23
				62	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	63	#
				64	# 2016/02/27
				65	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	66	#
				67	# 2016/03/17
				68	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	69	#
				70	# 2016/03/18
				71	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	72	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	73	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	74	# - Added multi archive support
				75	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	76	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	77	#
				78	# 2016/07/06
				79	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	80	#
				81	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	82	# - Fixed temporary path issue in script
				83	#
				84	# 2016/10/24
				85	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	86	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	87	# 2016/10/24
				88	# - Added support for document extraction
				89	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	90	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	91	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	92	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	93	# 2016/12/21
				94	# - added support for base-sentences and base-tokenizations
				95	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	96	# 2017/01/20
				97	# - added support for DRuKoLa annotations
				98	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	99	# 2017/02/08
				100	# - added support for pagebreak annotations
				101	#
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	102	# 2017/04/06
				103	# - added support for wildcards in input
				104	#
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	105	# 2017/04/07
				106	# - support configuration option
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	107	# - support for temporary extraction
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	108	#
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	109	# 2017/04/12
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	110	# - support serial processing
				111	# - support input root
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	112	# - introduced --sequential-extraction flag
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	113	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	114
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	115	our $LAST_CHANGE = '2017/04/12';
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	116	our $LOCAL = $FindBin::Bin;
				117	our $VERSION_MSG = <<"VERSION";
				118	Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
				119	VERSION
				120
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	121	# Prototypes
				122	sub get_file_name_from_glob($);
				123	sub get_file_name($);
				124
# Parse command: a first argument that does not start with a dash
# is treated as the subcommand and removed from the argument list
my $cmd;
if ($ARGV[0] && index($ARGV[0], '-') != 0) {
  $cmd = shift @ARGV;
};

# Remember the arguments as given, as GetOptions() consumes @ARGV
my @keep_argv = @ARGV;

my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# Options that can be given multiple times are collected in arrays,
# all other options are bound to newly declared scalars.
# In case the command line can't be parsed, the usage is shown and
# the script exits with an error code instead of continuing with
# partially parsed options.
GetOptions(
  'input|i=s'                => \@input,
  'input-base|ib=s'          => \(my $input_base),
  'output|o=s'               => \(my $output),
  'overwrite|w'              => \(my $overwrite),
  'meta|m=s'                 => \(my $meta),
  'token|t=s'                => \(my $token_base),
  'base-sentences|bs=s'      => \(my $base_sentences),
  'base-paragraphs|bp=s'     => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s'    => \(my $base_pagebreaks),
  'gzip|z'                   => \(my $gzip),
  'temporary-extract|te=s'   => \(my $extract_dir),
  'skip|s=s'                 => \@skip,
  'sigle|sg=s'               => \@sigle,
  'cache|c=s'                => \(my $cache_file),
  'config|cfg=s'             => \(my $cfg_file),
  'log|l=s'                  => \(my $log_level),
  'anno|a=s'                 => \@anno,
  'primary|p!'               => \(my $primary),
  'pretty|y'                 => \(my $pretty),
  'jobs|j=i'                 => \(my $jobs),
  'sequential-extraction|se' => \(my $sequential_extraction),
  'cache-size|cs=s'          => \(my $cache_size),
  'cache-delete|cd!'         => \(my $cache_delete),
  'cache-init|ci!'           => \(my $cache_init),

  # Show the full usage
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Show the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);
				178
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	179
# Load from configuration.
# Values given on the command line always take precedence: a configuration
# value is only applied, if the corresponding option is not set yet.
if ($cfg_file && -e $cfg_file) {
  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Single value options: configuration key and the variable to fill
  my @scalar_options = (
    [ 'overwrite'             => \$overwrite ],
    [ 'gzip'                  => \$gzip ],
    [ 'jobs'                  => \$jobs ],
    [ 'input-base'            => \$input_base ],
    [ 'temporary-extract'     => \$extract_dir ],
    [ 'token'                 => \$token_base ],
    [ 'cache'                 => \$cache_file ],
    [ 'cache-size'            => \$cache_size ],
    [ 'cache-delete'          => \$cache_delete ],
    [ 'cache-init'            => \$cache_init ],
    [ 'sequential-extraction' => \$sequential_extraction ],
    [ 'meta'                  => \$meta ],
    [ 'output'                => \$output ],
    [ 'base-sentences'        => \$base_sentences ],
    [ 'base-paragraphs'       => \$base_paragraphs ],
    [ 'base-pagebreaks'       => \$base_pagebreaks ],
    [ 'log'                   => \$log_level ]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;
    next if defined $$target || !defined $config{$key};
    $$target = $config{$key};
  };

  # Multiple value options are given as a semicolon separated list.
  # Whitespace around the semicolons is optional.
  my @list_options = (
    [ 'skip'  => \@skip ],
    [ 'sigle' => \@sigle ],
    [ 'anno'  => \@anno ]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;
    next if scalar(@$target) || !defined $config{$key};
    @$target = split /\s*;\s*/, $config{$key};
  };
};
				286
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	287
# Set defaults for all options that were given neither
# on the command line nor in the configuration file
$token_base            //= 'OpenNLP#tokens';
$cache_file            //= 'korapxml2krill.cache';
$cache_size            //= '50m';
$jobs                  //= 0;
$cache_delete          //= 1;
$cache_init            //= 1;
$sequential_extraction //= 0;
$log_level             //= 'ERROR';
$base_sentences        //= '';
$base_paragraphs       //= '';
$base_pagebreaks       //= '';

# The base layers are compared in lower case later on
$_ = lc $_ foreach ($base_sentences, $base_paragraphs, $base_pagebreaks);


# Initialize log4perl object (logging to STDERR)
Log::Log4perl->init({
  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout' => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
});

my $log = Log::Log4perl->get_logger('main');


print "Reading config from $cfg_file\n" if $cfg_file;


# Parameters for pod2usage() in case of invalid calls:
# show the usage sections and exit with an error code
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;


# A job count of -1 requests a job count based on the number of cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil(5 * $cores);
  $log->info("Run using $jobs jobs on $cores cores");
};
				340
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	341
# Start serial processing: every input is processed by a separate run
# of this script using the 'archive' command, each one writing to a
# directory of its own below the output directory.
if (defined $cmd && $cmd eq 'serial') {

  # An undefined output directory can't serve as the base
  # for the target directories of the single runs
  unless ($output) {
    print "Serial processing requires an output directory.\n\n";
    exit(1);
  };

  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Remove all input and output flags together with their values,
  # as they are set separately for every single run
  my @child_argv;
  my $remove_next = 0;
  foreach my $arg (@keep_argv) {

    # Input or output flag
    if ($arg eq '-i' || $arg eq '--input' || $arg eq '--output' || $arg eq '-o') {
      $remove_next = 1;
    }

    # Value of an input or output flag
    elsif ($remove_next) {
      $remove_next = 0;
    }

    # Pass parameter
    else {
      push @child_argv, $arg;
    };
  };
  @keep_argv = @child_argv;

  my $failed = 0;

  # Iterate over all inputs
  foreach my $single_input (@input) {

    # The name of the target directory is derived from the input pattern
    my $new_out = catdir($output, get_file_name_from_glob($single_input));

    # Create new path
    if (make_path($new_out) == 0 && !-d $new_out) {
      $log->error("Can\'t create path $new_out");
      exit(1);
    };

    # Create archive command, run by the same perl executable
    my @archive_cmd = (
      $^X, $0, 'archive', @keep_argv, '-i', $single_input, '-o', $new_out
    );
    print "Start serial processing of $single_input to $new_out\n";

    # Start archiving (list form, so no shell is involved)
    if (system(@archive_cmd) != 0) {
      $log->error("Serial processing of $single_input failed");
      $failed++;
    };
  };

  # Signal failure, if at least one of the runs failed
  exit($failed ? 1 : 0);
};
				393
# Index all foundries and layers to skip in lower case
my %skip;
$skip{lc($_)} = 1 foreach @skip;

# Collect all supported annotation layers as [Foundry, Layer] pairs.
# The order of the layers is relevant and therefore kept stable.
my @layers;

# The Base layers are only added, if no other base is given
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

# Connexor
push(@layers, map { ['Connexor', $_] } qw/Morpho Syntax Phrase Sentences/);

# CoreNLP
push(@layers, map { ['CoreNLP', $_] } qw/NamedEntities Sentences Morpho Constituency/);

# DeReKo - the structure layer may serve as base for
# sentences, paragraphs and pagebreaks
my @dereko_attr = ();
push(@dereko_attr, 'sentences')  if $base_sentences  eq 'dereko#structure';
push(@dereko_attr, 'paragraphs') if $base_paragraphs eq 'dereko#structure';
push(@dereko_attr, 'pagebreaks') if $base_pagebreaks eq 'dereko#structure';

if (@dereko_attr) {
  push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
}
else {
  push(@layers, ['DeReKo', 'Structure']);
};

# All further foundries with their layers
my @further_foundries = (
  [ 'Glemm'      => qw/Morpho/ ],
  [ 'Malt'       => qw/Dependency/ ],
  [ 'MDParser'   => qw/Dependency/ ],
  [ 'Mate'       => qw/Morpho Dependency/ ],
  [ 'OpenNLP'    => qw/Morpho Sentences/ ],
  [ 'Sgbr'       => qw/Lemma Morpho/ ],               # Schreibgebrauch
  [ 'TreeTagger' => qw/Morpho Sentences/ ],
  [ 'XIP'        => qw/Morpho Constituency Sentences Dependency/ ],
  [ 'DRuKoLa'    => qw/Morpho/ ],
  [ 'MarMoT'     => qw/Morpho/ ]
);

foreach my $entry (@further_foundries) {
  my ($foundry, @layer_names) = @$entry;
  push(@layers, map { [$foundry, $_] } @layer_names);
};


# Check filters
my @filtered_anno;

# Everything is skipped - only add the explicitly requested annotations
if ($skip{'#all'}) {
  foreach (@anno) {
    push @filtered_anno, [ split('#', $_) ];
  };
}

# Add all annotations that are not skipped
else {
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};

# Get tokenization basis.
# The declaration is separated from the conditional assignment, as the
# behaviour of 'my' combined with a statement modifier is undefined.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};

# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
  share_file => $cache_file,
  cache_size => $cache_size,
  init_file  => $cache_init
);

# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(
  cache     => $cache,
  meta_type => $meta,
  overwrite => $overwrite,
  foundry   => $token_base_foundry,
  layer     => $token_base_layer,
  gzip      => $gzip,
  log       => $log,
  primary   => $primary,
  pretty    => $pretty,
  anno      => \@filtered_anno
);
				514
# Get file name based on path information.
# Expects the path of a file and returns a name for it, with all
# slashes replaced by dashes. Temporary directory fragments at the
# beginning and the path of the first input are removed, and names
# with more than three dash separated segments are shortened
# at the beginning.
sub get_file_name ($) {
  my $file = shift;

  # The first input serves as the base to remove from the path
  my $base = $input[0] // '';

  # In case the input is a directory, the last path segment is kept
  if (-d $base) {
    $base =~ s![^\/]+$!!;
  };

  # Remove temp dir fragments
  $file =~ s!^/?tmp/[^/]+!!;

  # Remove the input base. The base is quoted, as paths may contain
  # characters with a special meaning in regular expressions.
  # As before, the match is not anchored to the beginning.
  $file =~ s/\/?\Q$base\E//;

  $file =~ tr/\//-/;
  $file =~ s{^-+}{};
  $file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
  return $file;
};
				531
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	532
				533	sub get_file_name_from_glob ($) {
				534	my $glob = shift;
Akron	bd3adda	2017-04-11 15:00:55 +0200	[diff] [blame]	535	$glob =~ s![\\\/]!-!g; # Transform paths
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	536	$glob =~ s/[\*\?]//g; # Remove arbitrary fills
				537	$glob =~ s/[\{\}\[\]]/-/g; # Remove class and multiple brackets
				538	$glob =~ s/\-\-+/-/g; # Remove sequences of binding characters
				539	$glob =~ s/^-//; # Clean beginning
				540	$glob =~ s/-$//; # Clean end
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	541	$glob =~ s/\.zip$//; # Remove file extension
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	542	return $glob;
				543	};
				544
				545
# Convert sigle to path construct (CORPUS_DOC.TEXT becomes CORPUS/DOC/TEXT).
# Whitespace surrounding the sigle is optional and removed.
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;

# All commands require the output directory to exist, if given
if ($cmd) {
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };
};


# Glob and prefix files
if (@input) {

  my @new_input = ();

  # Iterate over all inputs
  foreach my $wild_card (@input) {

    # Prefix with input root
    my $pattern = $input_base ? catfile($input_base, $wild_card) : $wild_card;

    # Expand wildcards to all matching files
    push (@new_input, bsd_glob($pattern));
  };

  # Sort files by length
  @input = sort { length($a) <=> length($b) } @new_input;

  print 'Input is ' . join(', ', @input)."\n";
};
				576
				577
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # There is nothing to process, e.g. in case no file matched the input
  unless (defined $input) {
    $log->error('No input found to process');
    exit(1);
  };

  # Start the stop watch at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document - the path needs a trailing slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time;

  # The document was processed - exit without an error code
  exit(0);
};
				609
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	610
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	611	# Extract XML files
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	612	if ($cmd eq 'extract') {
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	613
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	614	# Create new archive object
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	615	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	616
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	617	# Check zip capabilities
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	618	unless ($archive->test_unzip) {
				619	print "Unzip is not installed or incompatible.\n\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	620	exit(0);
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	621	};
				622
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	623	# Add further annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	624	$archive->attach($_) foreach @input[1..$#input];
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	625
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	626	my $prefix = 1;
				627
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	628	# No sigles given
				629	unless (@sigle) {
				630
				631	# Get files
				632	foreach ($archive->list_texts) {
				633
				634	# Split path information
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	635	($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	636
				637	# TODO: Make this OS independent
				638	push @sigle, join '/', $corpus, $doc, $text;
				639	};
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	640	}
				641
				642	# Check sigle for doc sigles
				643	else {
				644	my @new_sigle;
				645
				646	my $prefix_check = 0;
				647
				648	# Iterate over all sigle
				649	foreach (@sigle) {
				650
				651	# Sigle is a doc sigle
				652	if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	653
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	654	print "$_ ...";
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	655	# Check if a prefix is needed
				656	unless ($prefix_check) {
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	657
				658	if ($prefix = $archive->check_prefix) {
				659	print " with prefix ...";
				660	};
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	661	$prefix_check = 1;
				662	};
				663
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	664	print "\n";
				665
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	666	# TODO: Make this OS independent
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	667	my $path = ($prefix ? './' : '') . $_;
				668
				669	print '... ' . (
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	670	$archive->extract_doc(
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	671	$path, $output, $sequential_extraction ? 1 : $jobs
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	672	) ? '' : 'not '
				673	);
				674	print "extracted.\n";
				675	}
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	676
				677	# Sigle is a text sigle
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	678	else {
				679	push @new_sigle, $_;
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	680
				681	unless ($prefix_check) {
				682
				683	if ($prefix = $archive->check_prefix) {
				684	print " with prefix ...";
				685	};
				686	$prefix_check = 1;
				687	};
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	688	};
				689	};
				690	@sigle = @new_sigle;
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	691	};
				692
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	693	# Iterate over all given sigles and extract
				694	foreach (@sigle) {
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	695
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	696	print "$_ ...\n";
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	697
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	698	# TODO: Make this OS independent
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	699	print '... ' . (
Akron	60a8caa	2017-02-17 21:51:27 +0100	[diff] [blame]	700
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	701	$archive->extract_text(
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	702	($prefix ? './' : '') . $_, $output
				703	) ? '' : 'not '
				704	);
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	705	print "extracted.\n";
				706	};
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	707	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	708
				709	# Can't create archive object
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	710	else {
				711	$log->error('Unable to extract from primary archive ' . $input[0]);
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	712	exit(1);
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	713	};
				714	}
				715
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	716
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	717	# Process an archive
				718	elsif ($cmd eq 'archive') {
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	719
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	720	my $archive_output;
				721
				722	# First extract, then archive
				723	if (defined $extract_dir) {
				724
				725	# Create new archive object
				726	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
				727
				728	# Check zip capabilities - a missing or incompatible unzip is an error, so exit with a non-zero status
				729	unless ($archive->test_unzip) {
				730	print "Unzip is not installed or incompatible.\n\n";
				731	exit(1);
				732	};
				733
				734	# Add further annotation archives
				735	$archive->attach($_) foreach @input[1..$#input];
				736
				737	# Create a temporary directory
				738	if ($extract_dir eq ':temp:') {
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	739	$extract_dir = tempdir(CLEANUP => 0);
				740	print "Temporarily extract to $extract_dir\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	741	};
				742
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	743	# Add some random extra to avoid clashes with multiple archives
				744	$extract_dir = catdir($extract_dir, random_string('cccccc'));
				745
				746	# Extract to temporary directory
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	747	if ($archive->extract_all($extract_dir, $sequential_extraction ? 1: $jobs)) {
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	748	@input = ($extract_dir);
				749	}
				750	else {
				751	$log->error('Unable to extract from primary archive ' . $input[0] .
				752	' to ' . $extract_dir);
				753	exit(1);
				754	};
				755	}
				756
				757	# Can't create archive object
				758	else {
				759	$log->error('Unable to extract from primary archive ' . $input[0]);
				760	exit(1);
				761	};
				762	};
				763
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	764	# TODO: Support sigles
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	765
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	766	# Zero means: everything runs in the parent process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	767	my $pool = Parallel::ForkManager->new($jobs);
				768
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	769	my $count = 0; # Texts to process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	770	my $iter = 1; # Current text in process
				771
				772	# Report on fork message
				773	$pool->run_on_finish (
				774	sub {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	775	my ($pid, $code) = @_;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	776	my $data = pop;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	777
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	778	print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	779	($iter++) . "/$count]" .
				780	($code ? " $code" : '') .
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	781	' ' . $data->[0] . "\n";
				782	$data->[1] = undef if $data->[1];
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	783	}
				784	);
				785
				786	my $t;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	787	my $temp;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	788	print "Reading data ...\n";
				789
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	790	# unless (Cache::FastMmap->new(
				791	# share_file => $cache_file,
				792	# cache_size => $cache_size,
				793	# init_file => $cache_init
				794	# )) {
				795	# print "Unable to intialize cache '$cache_file'\n\n";
				796	# exit(1);
				797	# };
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	798
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	799	# Input is a directory
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	800	if (-d $input[0]) {
				801	my $it = Directory::Iterator->new($input[0]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	802	my @dirs;
				803	my $dir;
				804
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	805	# Todo: Make a DO WHILE
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	806	while (1) {
				807	if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	808	push @dirs, $dir;
				809	$it->prune;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	810	};
				811	last unless $it->next;
				812	};
				813
				814	print "Start processing ...\n";
				815	$t = Benchmark->new;
				816	$count = scalar @dirs;
				817
				818	DIRECTORY_LOOP:
				819	for (my $i = 0; $i < $count; $i++) {
				820
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	821	my $filename = catfile(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	822	$output,
				823	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	824	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	825
				826	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	827	$pool->start and next DIRECTORY_LOOP;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	828
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	829	if (my $return = $batch_file->process($dirs[$i] => $filename)) {
				830	$pool->finish(
				831	0,
				832	["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
				833	);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	834	}
				835	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	836	$pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	837	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	838	};
				839	}
				840
				841	# Input is a file
Akron	29866ac	2016-06-24 16:40:47 +0200	[diff] [blame]	842	elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	843
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	844	unless ($archive->test_unzip) {
				845	print "Unzip is not installed or incompatible.\n\n";
				846	exit(1);
				847	};
				848
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	849	# Add further annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	850	$archive->attach($_) foreach @input[1..$#input];
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	851
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	852	print "Start processing ...\n";
				853	$t = Benchmark->new;
				854	my @dirs = $archive->list_texts;
				855	$count = scalar @dirs;
				856
				857	ARCHIVE_LOOP:
				858	for (my $i = 0; $i < $count; $i++) {
				859
				860	# Split path information
				861	my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
				862
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	863	my $filename = catfile(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	864	$output,
				865	get_file_name(
				866	catfile($corpus, $doc, $text)
				867	. '.json' . ($gzip ? '.gz' : '')
				868	)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	869	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	870
				871	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	872	$pool->start and next ARCHIVE_LOOP;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	873
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	874	# Create temporary directory
				875	$temp = File::Temp->newdir;
				876
Akron	bdf434a	2016-10-24 17:42:07 +0200	[diff] [blame]	877	# TODO: Check if $filename exist at the beginning,
				878	# because extraction can be horribly slow!
				879
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	880	# Extract from archive
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	881	if ($archive->extract_text($dirs[$i], $temp)) {
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	882
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	883	# Create corpus directory
				884	my $input = catdir("$temp", $corpus);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	885
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	886	# Temporary directory
				887	my $dir = catdir($input, $doc, $text);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	888
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	889	# Write file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	890	if (my $return = $batch_file->process($dir => $filename)) {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	891	# Delete temporary file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	892	$pool->finish(
				893	0,
				894	["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
				895	);
				896	#$pool->finish(0, ["Processed " . $filename, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	897	}
				898	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	899	# Delete temporary file
				900	$pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	901	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	902	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	903
				904	# Unable to extract
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	905	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	906	$pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	907	};
				908	};
				909	}
				910
				911	else {
				912	print "Input is neither a directory nor an archive.\n\n";
				913	};
				914
				915	$pool->wait_all_children;
				916
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	917	# Delete cache file
				918	unlink($cache_file) if $cache_delete;
				919
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	920	print timestr(timediff(Benchmark->new, $t))."\n";
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	921	print "Done.\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	922	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	923
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	924
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	925	# Cleanup temporary extraction directory and report the object count returned by remove_tree (NOTE(review): for ':temp:' only the random subdirectory seems to be removed, the tempdir(CLEANUP => 0) parent appears to stay - confirm)
				926	if ($extract_dir) {
				927	my $objects = remove_tree($extract_dir, { safe => 1 });
				928	print "Removed directory $extract_dir with $objects objects.\n";
				929	};
				930
				931
				932	print "\n";
				933
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	934	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	935
				936	=pod
				937
				938	=encoding utf8
				939
				940	=head1 NAME
				941
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	942	korapxml2krill - Merge KorapXML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	943
				944
				945	=head1 SYNOPSIS
				946
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	947	korapxml2krill [archive\|extract] --input <directory\|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	948
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	949
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	950	=head1 DESCRIPTION
				951
				952	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				953	compatible with the L<Krill\|https://github.com/KorAP/Krill> indexer.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	954	The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	955
				956
				957	=head1 INSTALLATION
				958
				959	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm\|App::cpanminus>.
				960
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	961	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	962
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	963	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	964	be available on your command line immediately.
Akron	7438151	2016-10-14 11:56:22 +0200	[diff] [blame]	965	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	966	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	967
				968	=head1 ARGUMENTS
				969
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	970	$ korapxml2krill -z --input <directory> --output <filename>
				971
				972	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	973	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	974
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	975	=over 2
				976
				977	=item B<archive>
				978
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	979	$ korapxml2krill archive -z --input <directory\|archive> --output <directory>
				980
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	981	Converts an archive of KorAP-XML documents. It expects a directory
				982	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	983
				984	=item B<extract>
				985
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	986	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				987
				988	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	989
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	990	=item B<serial>
				991
				992	$ korapxml2krill serial -i <archive1> -i <archive2> -o <directory> -cfg <config-file>
				993
				994	Convert archives sequentially. The inputs are not merged but treated
				995	as they are (so they may be premerged or globs).
				996	The C<--output> directory is treated as the base directory where subdirectories
				997	are created based on the archive name.
				998
				999
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1000	=back
				1001
				1002
				1003	=head1 OPTIONS
				1004
				1005	=over 2
				1006
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1007	=item B<--input\|-i> <directory\|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1008
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1009	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1010
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1011	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	1012	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				1013	file to batch process multiple files.
				1014	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1015
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1016	C<archive> supports multiple input zip files with the constraint,
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1017	that the first archive listed contains all primary data files
				1018	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1019
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1020	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1021
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1022	Input may also be defined using BSD glob wildcards.
				1023
				1024	-i 'file/news*.zip'
				1025
				1026	The extended input array will be sorted in length order, so the shortest
				1027	path needs to contain all primary data files and all meta data files.
				1028
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	1029	(The directory structure follows the base directory format,
				1030	that may include a C<.> root folder.
				1031	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1032	need to be passed with a hash sign in front of the archive's name.
				1033	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	1034
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1035	To support zip files, a version of C<unzip> needs to be installed that is
				1036	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	1037
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	1038	B<The root folder switch using the hash sign is experimental and
				1039	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	1040
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1041	=item B<--input-base\|-ib> <directory>
				1042
				1043	The base directory for inputs.
				1044
				1045
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1046	=item B<--output\|-o> <directory\|file>
				1047
				1048	Output folder for archive processing or
				1049	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1050	writes to C<STDOUT> by default
				1051	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1052
				1053	=item B<--overwrite\|-w>
				1054
				1055	Overwrite files that already exist.
				1056
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1057	=item B<--token\|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1058
				1059	Define the default tokenization by specifying
				1060	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1061	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1062
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	1063
				1064	=item B<--base-sentences\|-bs> <foundry>#<layer>
				1065
				1066	Define the layer for base sentences.
				1067	If given, this will be used instead of using C<Base#Sentences>.
				1068	Currently C<DeReKo#Structure> is the only additional layer supported.
				1069
				1070	Defaults to unset.
				1071
				1072
				1073	=item B<--base-paragraphs\|-bp> <foundry>#<layer>
				1074
				1075	Define the layer for base paragraphs.
				1076	If given, this will be used instead of using C<Base#Paragraphs>.
				1077	Currently C<DeReKo#Structure> is the only additional layer supported.
				1078
				1079	Defaults to unset.
				1080
				1081
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	1082	=item B<--base-pagebreaks\|-bpb> <foundry>#<layer>
				1083
				1084	Define the layer for base pagebreaks.
				1085	Currently C<DeReKo#Structure> is the only layer supported.
				1086
				1087	Defaults to unset.
				1088
				1089
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1090	=item B<--skip\|-s> <foundry>[#<layer>]
				1091
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1092	Skip specific annotations by specifying the foundry
				1093	(and optionally the layer with a C<#>-prefix),
				1094	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1095	Can be set multiple times.
				1096
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1097	=item B<--anno\|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1098
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1099	Convert specific annotations by specifying the foundry
				1100	(and optionally the layer with a C<#>-prefix),
				1101	e.g. C<Mate> or C<Mate#Morpho>.
				1102	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1103
				1104	=item B<--primary\|-p>
				1105
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1106	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1107	Can be flagged using C<--no-primary> as well.
				1108	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1109
				1110	=item B<--jobs\|-j>
				1111
				1112	Define the number of concurrent jobs in separate forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1113	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1114	Defaults to C<0> (everything runs in a single process).
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	1115
				1116	If C<sequential-extraction> is not set to true, this will
				1117	also apply to extraction.
				1118
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	1119	Pass -1, and the value will be set automatically to 5
				1120	times the number of available cores.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1121	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1122
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	1123	=item B<--sequential-extraction\|-se>
				1124
				1125	Flag to indicate that extraction runs sequentially, i.e. that the C<jobs> value does not apply to extraction.
				1126	Some systems may have problems with extracting multiple archives
				1127	to the same folder at the same time.
				1128	Can be flagged using C<--no-sequential-extraction> as well.
				1129	Defaults to C<false>.
				1130
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1131	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1132
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1133	Define the metadata parser to use. Defaults to C<I5>.
				1134	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				1135	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1136
				1137	=item B<--pretty\|-y>
				1138
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1139	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1140	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1141
				1142	=item B<--gzip\|-z>
				1143
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1144	Compress the output.
				1145	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1146
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1147	=item B<--cache\|-c>
				1148
				1149	File to mmap a cache (using L<Cache::FastMmap>).
				1150	Defaults to C<korapxml2krill.cache> in the calling directory.
				1151
				1152	=item B<--cache-size\|-cs>
				1153
				1154	Size of the cache. Defaults to C<50m>.
				1155
				1156	=item B<--cache-init\|-ci>
				1157
				1158	Initialize cache file.
				1159	Can be flagged using C<--no-cache-init> as well.
				1160	Defaults to C<true>.
				1161
				1162	=item B<--cache-delete\|-cd>
				1163
				1164	Delete cache file after processing.
				1165	Can be flagged using C<--no-cache-delete> as well.
				1166	Defaults to C<true>.
				1167
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1168	=item B<--config\|-cfg>
				1169
				1170	Configure the parameters of your call in a file
				1171	of key-value pairs with whitespace separator
				1172
				1173	overwrite 1
				1174	token DeReKo#Structure
				1175	...
				1176
				1177	Supported parameters are:
Akron	63f20d4	2017-04-10 23:40:29 +0200	[diff] [blame]	1178	C<overwrite>, C<gzip>, C<jobs>, C<input-base>,
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1179	C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron	9ec8887	2017-04-12 16:29:06 +0200	[diff] [blame^]	1180	C<output>,
				1181	C<temp-extract>, C<sequential-extraction>,
				1182	C<base-sentences>, C<base-paragraphs>,
				1183	C<base-pagebreaks>,
				1184	C<skip> (semicolon separated), C<sigle>
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1185	(semicolon separated), C<anno> (semicolon separated).
				1186
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1187	=item B<--temporary-extract\|-te>
				1188
				1189	Only valid for the C<archive> command.
				1190
				1191	This will first extract all files into a
				1192	directory and then will archive.
				1193	If the directory is given as C<:temp:>,
				1194	a temporary directory is used.
				1195	This is especially useful to avoid
				1196	massive unzipping and potential
				1197	network latency.
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1198
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1199	=item B<--sigle\|-sg>
				1200
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1201	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1202	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1203	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	1204	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1205	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1206	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1207
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1208	=item B<--log\|-l>
				1209
				1210	The L<Log4perl> log level, defaults to C<ERROR>.
				1211
				1212	=item B<--help\|-h>
				1213
				1214	Print this document.
				1215
				1216	=item B<--version\|-v>
				1217
				1218	Print version information.
				1219
				1220	=back
				1221
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1222	=head1 ANNOTATION SUPPORT
				1223
				1224	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				1225	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				1226	The base foundry with paragraphs, sentences, and the text element are mandatory for
				1227	L<Krill\|https://github.com/KorAP/Krill>.
				1228
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1229	Base
				1230	#Paragraphs
				1231	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1232
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1233	Connexor
				1234	#Morpho
				1235	#Phrase
				1236	#Sentences
				1237	#Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1238
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1239	CoreNLP
				1240	#Constituency
				1241	#Morpho
				1242	#NamedEntities
				1243	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1244
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1245	DeReKo
				1246	#Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1247
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1248	DRuKoLa
				1249	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1250
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1251	Glemm
				1252	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1253
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1254	Malt
				1255	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1256
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1257	MarMoT
				1258	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1259
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1260	Mate
				1261	#Dependency
				1262	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1263
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1264	MDParser
				1265	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1266
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1267	OpenNLP
				1268	#Morpho
				1269	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1270
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1271	Sgbr
				1272	#Lemma
				1273	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1274
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1275	TreeTagger
				1276	#Morpho
				1277	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1278
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1279	XIP
				1280	#Constituency
				1281	#Morpho
				1282	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1283
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1284
				1285	More importers are in preparation.
				1286	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				1287	See the built-in annotation importers as examples.
				1288
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1289	=head1 AVAILABILITY
				1290
				1291	https://github.com/KorAP/KorAP-XML-Krill
				1292
				1293
				1294	=head1 COPYRIGHT AND LICENSE
				1295
Akron	3ec0a1c	2017-01-18 14:41:55 +0100	[diff] [blame]	1296	Copyright (C) 2015-2017, L<IDS Mannheim\|http://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1297
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1298	Author: L<Nils Diewald\|http://nils-diewald.de/>
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame]	1299
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1300	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1301
				1302	L<KorAP::XML::Krill> is developed as part of the L<KorAP\|http://korap.ids-mannheim.de/>
				1303	Corpus Analysis Platform at the
				1304	L<Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				1305	member of the
				1306	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
				1307
				1308	This program is free software published under the
				1309	L<BSD-2 License\|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				1310
				1311	=cut