Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: af1da02bec8daef2a6de1591171f94366a89c183 [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	10	use POSIX qw/ceil/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	11	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	12	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	13	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	14	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	15	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	16	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	18	use KorAP::XML::Batch::File;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	19	use Parallel::ForkManager;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	20	use v5.10;
				21	use Sys::Info;
				22	use Sys::Info::Constants qw( :device_cpu );
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	23	use File::Glob ':bsd_glob';
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	24
				25	# use KorAP::XML::ForkPool;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	26	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	27	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	28
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	29	# TODO: Use KorAP::XML::ForkPool!
				30
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	31	# CHANGES:
				32	# ----------------------------------------------------------
				33	# 2013/11/25
				34	# - Initial release
				35	#
				36	# 2014/10/29
				37	# - Merges foundry data to create indexer friendly documents
				38	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	39	# 2016/02/04
				40	# - renamed to korapxml2krill
				41	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	42	#
				43	# 2016/02/12
				44	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	45	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	46	#
				47	# 2016/02/14
				48	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	49	# - Added support for archive files
				50	#
				51	# 2016/02/15
				52	# - Fixed temporary directory bug
				53	# - Improved skipping before unzipping
				54	# - Added EXPERIMENTAL concurrency support
				55	#
				56	# 2016/02/23
				57	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	58	#
				59	# 2016/02/27
				60	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	61	#
				62	# 2016/03/17
				63	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	64	#
				65	# 2016/03/18
				66	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	67	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	68	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	69	# - Added multi archive support
				70	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	71	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	72	#
				73	# 2016/07/06
				74	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	75	#
				76	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	77	# - Fixed temporary path issue in script
				78	#
				79	# 2016/10/24
				80	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	81	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	82	# 2016/10/24
				83	# - Added support for document extraction
				84	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	85	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	86	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	87	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	88	# 2016/12/21
				89	# - added support for base-sentences and base-tokenizations
				90	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	91	# 2017/01/20
				92	# - added support for DRuKoLa annotations
				93	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	94	# 2017/02/08
				95	# - added support for pagebreak annotations
				96	#
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	97	# 2017/04/06
				98	# - added support for wildcards in input
				99	#
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	100	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	101
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	102	our $LAST_CHANGE = '2017/04/06';
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	103	our $LOCAL = $FindBin::Bin;
				104	our $VERSION_MSG = <<"VERSION";
				105	Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
				106	VERSION
				107
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	108	# Parse command
				109	my $cmd;
				110	our @ARGV;
				111	if ($ARGV[0] && index($ARGV[0], '-') != 0) {
				112	$cmd = shift @ARGV;
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	113	};
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	114
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	115	my (@skip, @sigle, @anno, @input);
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	116	my $text;
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	117
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	118	# Parse options from the command line
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	119	GetOptions(
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	120	'input\|i=s' => \@input,
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	121	'output\|o=s' => \(my $output),
				122	'overwrite\|w' => \(my $overwrite),
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	123	'meta\|m=s' => \(my $meta),
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	124	'token\|t=s' => \(my $token_base = 'OpenNLP#tokens'),
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	125	'base-sentences\|bs=s' => \(my $base_sentences = ''),
				126	'base-paragraphs\|bp=s' => \(my $base_paragraphs = ''),
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	127	'base-pagebreaks\|bpb=s' => \(my $base_pagebreaks = ''),
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	128	'gzip\|z' => \(my $gzip),
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	129	'skip\|s=s' => \@skip,
				130	'sigle\|sg=s' => \@sigle,
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	131	'cache\|c=s' => \(my $cache_file = 'korapxml2krill.cache'),
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	132	'log\|l=s' => \(my $log_level = 'ERROR'),
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	133	'anno\|a=s' => \@anno,
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	134	'primary\|p!' => \(my $primary),
				135	'pretty\|y' => \(my $pretty),
				136	'jobs\|j=i' => \(my $jobs = 0),
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	137	'cache-size\|cs=s' => \(my $cache_size = '50m'),
				138	'cache-delete\|cd!' => \(my $cache_delete = 1),
				139	'cache-init\|ci!' => \(my $cache_init = 1),
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	140	'help\|h' => sub {
				141	pod2usage(
				142	-sections => 'NAME\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	143	-verbose => 99,
				144	-msg => $VERSION_MSG,
				145	-output => '-'
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	146	);
				147	},
				148	'version\|v' => sub {
				149	pod2usage(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	150	-verbose => 0,
				151	-msg => $VERSION_MSG,
				152	-output => '-'
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	153	)
				154	}
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	155	);
				156
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	157	$base_sentences = lc $base_sentences;
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	158	$base_paragraphs = lc $base_paragraphs;
Akron	636bd9c	2017-02-09 17:13:00 +0100	[diff] [blame]	159	$base_pagebreaks = lc $base_pagebreaks;
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	160
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	161	my %ERROR_HASH = (
				162	-sections => 'NAME\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	163	-verbose => 99,
				164	-msg => $VERSION_MSG,
				165	-output => '-',
				166	-exit => 1
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	167	);
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	168
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	169	# Input has to be defined
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	170	pod2usage(%ERROR_HASH) unless @input;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	171
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	172	# Gzip has no effect, if no output is given
				173	pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	174
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	175	# Initialize log4perl object
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	176	Log::Log4perl->init({
				177	'log4perl.rootLogger' => uc($log_level) . ', STDERR',
				178	'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
				179	'log4perl.appender.STDERR.layout' => 'PatternLayout',
				180	'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
				181	});
				182
				183	my $log = Log::Log4perl->get_logger('main');
				184
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	185
				186	if ($jobs == -1) {
				187	state $cores = Sys::Info->new->device('CPU')->count;
				188	$jobs = ceil(5 * $cores);
				189	$log->info("Run using $jobs jobs");
				190	};
				191
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	192
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	193	my %skip;
				194	$skip{lc($_)} = 1 foreach @skip;
				195
				196	my @layers;
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	197	push(@layers, ['Base', 'Sentences']) unless $base_sentences;
				198	push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	199
				200	# Connexor
				201	push(@layers, ['Connexor', 'Morpho']);
				202	push(@layers, ['Connexor', 'Syntax']);
				203	push(@layers, ['Connexor', 'Phrase']);
				204	push(@layers, ['Connexor', 'Sentences']);
				205
				206	# CoreNLP
				207	push(@layers, ['CoreNLP', 'NamedEntities']);
				208	push(@layers, ['CoreNLP', 'Sentences']);
				209	push(@layers, ['CoreNLP', 'Morpho']);
				210	push(@layers, ['CoreNLP', 'Constituency']);
				211
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	212
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	213	# DeReKo
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	214	my @dereko_attr = ();
				215	if ($base_sentences eq 'dereko#structure') {
				216	push @dereko_attr, 'sentences';
				217	};
				218	if ($base_paragraphs eq 'dereko#structure') {
				219	push @dereko_attr, 'paragraphs';
				220	};
Akron	636bd9c	2017-02-09 17:13:00 +0100	[diff] [blame]	221
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	222	if ($base_pagebreaks eq 'dereko#structure') {
				223	push @dereko_attr, 'pagebreaks';
				224	};
				225
				226	if ($dereko_attr[0]) {
				227	push(@layers, ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]);
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	228	}
				229	else {
				230	push(@layers, ['DeReKo', 'Structure']);
				231	};
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	232
				233	# Glemm
				234	push(@layers, ['Glemm', 'Morpho']);
				235
				236	# Malt
				237	push(@layers, ['Malt', 'Dependency']);
				238
				239	# MDParser
				240	push(@layers, ['MDParser', 'Dependency']);
				241
				242	# Mate
				243	push(@layers, ['Mate', 'Morpho']);
				244	push(@layers, ['Mate', 'Dependency']);
				245
				246	# OpenNLP
				247	push(@layers, ['OpenNLP', 'Morpho']);
				248	push(@layers, ['OpenNLP', 'Sentences']);
				249
				250	# Schreibgebrauch
				251	push(@layers, ['Sgbr', 'Lemma']);
				252	push(@layers, ['Sgbr', 'Morpho']);
				253
				254	# TreeTagger
				255	push(@layers, ['TreeTagger', 'Morpho']);
				256	push(@layers, ['TreeTagger', 'Sentences']);
				257
				258	# XIP
				259	push(@layers, ['XIP', 'Morpho']);
				260	push(@layers, ['XIP', 'Constituency']);
				261	push(@layers, ['XIP', 'Sentences']);
				262	push(@layers, ['XIP', 'Dependency']);
				263
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	264	# DRuKoLa
				265	push(@layers, ['DRuKoLa', 'Morpho']);
				266
Akron	3bd942f	2017-02-20 20:09:14 +0100	[diff] [blame]	267	# Marmot
				268	push(@layers, ['MarMoT', 'Morpho']);
				269
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	270
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	271	# Check filters
				272	my @filtered_anno;
				273	if ($skip{'#all'}) {
				274	foreach (@anno) {
				275	push @filtered_anno, [ split('#', $_) ];
				276	};
				277	}
				278
				279	# Add all annotations that are not skipped
				280	else {
				281	# Add to index file - respect skipping
				282	foreach my $info (@layers) {
				283	# Skip if Foundry or Foundry#Layer should be skipped
				284	unless ($skip{lc($info->[0])} \|\| $skip{lc($info->[0]) . '#' . lc($info->[1])}) {
				285	push @filtered_anno, $info;
				286	};
				287	};
				288	};
				289
				290	# Get tokenization basis
				291	my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if ($token_base);
				292
				293	# TODO: This should not be initialized for batch
				294	my $cache = Cache::FastMmap->new(
				295	share_file => $cache_file,
				296	cache_size => $cache_size,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	297	init_file => $cache_init
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	298	);
				299
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	300	# Create batch object
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	301	my $batch_file = KorAP::XML::Batch::File->new(
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	302	cache => $cache,
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	303	meta_type => $meta,
				304	overwrite => $overwrite,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	305	foundry => $token_base_foundry,
				306	layer => $token_base_layer,
				307	gzip => $gzip,
				308	log => $log,
				309	primary => $primary,
				310	pretty => $pretty,
				311	anno => \@filtered_anno
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	312	);
				313
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	314
				315	# Get file name based on path information
				316	sub get_file_name ($) {
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	317	my $i = $input[0];
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	318	if (-d $i) {
				319	$i =~ s![^\/]+$!!;
				320	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	321	my $file = shift;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	322
				323	# Remove temp dir fragments
Akron	6255760	2016-06-27 14:10:13 +0200	[diff] [blame]	324	$file =~ s!^/?tmp/[^/]+!!;
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	325	$file =~ s/^?\/?$i//;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	326	$file =~ tr/\//-/;
				327	$file =~ s{^-+}{};
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	328	$file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	329	return $file;
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	330	};
				331
# Convert sigle to path construct, i.e. 'CORPUS_DOC.TEXT' becomes
# 'CORPUS/DOC/TEXT'. Surrounding whitespace is optional and gets removed
# (requiring exactly one whitespace character on both sides would leave
# plain sigles unconverted).
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
				334
# Commands require the output (if given) to be an existing directory.
# "-d" is false for non-existing paths as well, so no separate
# existence test is necessary.
if ($cmd) {
  if ($output && !-d $output) {
    print "Directory '$output' does not exist.\n\n";

    # This is an error - signal it to the calling process
    exit(1);
  };
};
				341
				342
# Glob files: Expand wildcards in the input values
if (@input) {

  # Expand all inputs
  my @new_input = map { bsd_glob($_) } @input;

  # Rewrite the input whenever the expansion changed it - not only when
  # the list has grown, as a pattern may expand to exactly one path.
  # An expansion without any result is ignored, so the input is kept
  # and the problem is reported by the command later on.
  if (@new_input && join("\0", @new_input) ne join("\0", @input)) {
    my $grown = scalar(@new_input) > scalar(@input);

    @input = @new_input;

    # Multiple matches are ordered by length, shortest first
    # (presumably to get the primary archive in front of its
    # annotation archives - TODO confirm)
    @input = sort { length($a) <=> length($b) } @input if $grown;

    print 'Input rewritten to ' . join(',', @input) . "\n";
  };
};
				357
				358
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	359	# Process a single file
				360	unless ($cmd) {
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	361	my $input = $input[0];
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	362
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	363	BEGIN {
				364	$main::TIME = Benchmark->new;
				365	$main::LAST_STOP = Benchmark->new;
				366	};
				367
				368	sub stop_time {
				369	my $new = Benchmark->new;
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	370	$log->info(
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	371	'The code took: '.
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	372	timestr(timediff($new, $main::LAST_STOP)) .
				373	' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
				374	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	375	$main::LAST_STOP = $new;
				376	};
				377
				378	# Create and parse new document
				379	$input =~ s{([^/])$}{$1/};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	380
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	381	# Process file
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	382	$batch_file->process($input, $output);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	383
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	384	# Delete cache file
				385	unlink($cache_file) if $cache_delete;
				386
Akron	5f51d42	2016-08-16 16:26:43 +0200	[diff] [blame]	387	stop_time;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	388	}
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	389
# Extract XML files
#
# Extracts documents and texts from the primary archive (the first input)
# and all attached archives (the further inputs) to the output directory.
# Without any sigle, all texts listed in the archive are extracted.
# Exits with 0 after the extraction was run and with 1, if unzip is not
# usable or the primary archive can't be opened.
elsif ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    my $prefix = 1;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach my $path ($archive->list_texts) {

        # Split path information
        ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($path);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {
      my @new_sigle;

      my $prefix_check = 0;

      # Check once (on the first sigle) if a prefix is needed
      my $check_prefix_once = sub {
        return if $prefix_check;

        if ($prefix = $archive->check_prefix) {
          print " with prefix ...";
        };
        $prefix_check = 1;
        return;
      };

      # Iterate over all sigle
      foreach my $sigle (@sigle) {

        # Sigle is a doc sigle
        if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$sigle ...";
          $check_prefix_once->();
          print "\n";

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $sigle;

          # Doc sigles are extracted immediately
          my $extracted = $archive->extract_doc($path, $output, $jobs);
          print '... ' . ($extracted ? '' : 'not ');
          print "extracted.\n";
        }

        # Sigle is a text sigle - extraction happens below
        else {
          push @new_sigle, $sigle;
          $check_prefix_once->();
        };
      };
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {

      print "$sigle ...\n";

      # TODO: Make this OS independent
      my $extracted = $archive->extract_text(
        ($prefix ? './' : '') . $sigle, $output
      );
      print '... ' . ($extracted ? '' : 'not ');
      print "extracted.\n";
    };

    print "\n";

    # The extraction was run - this is no error condition
    exit(0);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);

    # Signal the failure to the calling process
    exit(1);
  };
}
				496
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	497	# Process an archive
				498	elsif ($cmd eq 'archive') {
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	499
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	500	# TODO: Support sigles
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	501
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	502	# Zero means: everything runs in the parent process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	503	my $pool = Parallel::ForkManager->new($jobs);
				504
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	505	my $count = 0; # Texts to process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	506	my $iter = 1; # Current text in process
				507
				508	# Report on fork message
				509	$pool->run_on_finish (
				510	sub {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	511	my ($pid, $code) = @_;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	512	my $data = pop;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	513
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	514	print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	515	($iter++) . "/$count]" .
				516	($code ? " $code" : '') .
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	517	' ' . $data->[0] . "\n";
				518	$data->[1] = undef if $data->[1];
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	519	}
				520	);
				521
				522	my $t;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	523	my $temp;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	524	print "Reading data ...\n";
				525
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	526	# unless (Cache::FastMmap->new(
				527	# share_file => $cache_file,
				528	# cache_size => $cache_size,
				529	# init_file => $cache_init
				530	# )) {
				531	# print "Unable to intialize cache '$cache_file'\n\n";
				532	# exit(1);
				533	# };
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	534
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	535	# Input is a directory
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	536	if (-d $input[0]) {
				537	my $it = Directory::Iterator->new($input[0]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	538	my @dirs;
				539	my $dir;
				540
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	541	# Todo: Make a DO WHILE
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	542	while (1) {
				543	if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	544	push @dirs, $dir;
				545	$it->prune;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	546	};
				547	last unless $it->next;
				548	};
				549
				550	print "Start processing ...\n";
				551	$t = Benchmark->new;
				552	$count = scalar @dirs;
				553
				554	DIRECTORY_LOOP:
				555	for (my $i = 0; $i < $count; $i++) {
				556
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	557	my $filename = catfile(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	558	$output,
				559	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	560	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	561
				562	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	563	$pool->start and next DIRECTORY_LOOP;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	564
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	565	if (my $return = $batch_file->process($dirs[$i] => $filename)) {
				566	$pool->finish(
				567	0,
				568	["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
				569	);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	570	}
				571	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	572	$pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	573	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	574	};
				575	}
				576
				577	# Input is a file
Akron	29866ac	2016-06-24 16:40:47 +0200	[diff] [blame]	578	elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	579
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	580	unless ($archive->test_unzip) {
				581	print "Unzip is not installed or incompatible.\n\n";
				582	exit(1);
				583	};
				584
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	585	# Add further annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	586	$archive->attach($_) foreach @input[1..$#input];
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	587
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	588	print "Start processing ...\n";
				589	$t = Benchmark->new;
				590	my @dirs = $archive->list_texts;
				591	$count = scalar @dirs;
				592
				593	ARCHIVE_LOOP:
				594	for (my $i = 0; $i < $count; $i++) {
				595
				596	# Split path information
				597	my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
				598
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	599	my $filename = catfile(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	600	$output,
				601	get_file_name(
				602	catfile($corpus, $doc, $text)
				603	. '.json' . ($gzip ? '.gz' : '')
				604	)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	605	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	606
				607	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	608	$pool->start and next ARCHIVE_LOOP;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	609
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	610	# Create temporary file
				611	$temp = File::Temp->newdir;
				612
Akron	bdf434a	2016-10-24 17:42:07 +0200	[diff] [blame]	613	# TODO: Check if $filename exist at the beginning,
				614	# because extraction can be horribly slow!
				615
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	616	# Extract from archive
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	617	if ($archive->extract_text($dirs[$i], $temp)) {
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	618
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	619	# Create corpus directory
				620	my $input = catdir("$temp", $corpus);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	621
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	622	# Temporary directory
				623	my $dir = catdir($input, $doc, $text);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	624
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	625	# Write file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	626	if (my $return = $batch_file->process($dir => $filename)) {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	627	# Delete temporary file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	628	$pool->finish(
				629	0,
				630	["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
				631	);
				632	#$pool->finish(0, ["Processed " . $filename, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	633	}
				634	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	635	# Delete temporary file
				636	$pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	637	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	638	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	639
				640	# Unable to extract
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	641	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	642	$pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	643	};
				644	};
				645	}
				646
				647	else {
				648	print "Input is neither a directory nor an archive.\n\n";
				649	};
				650
				651	$pool->wait_all_children;
				652
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	653	# Delete cache file
				654	unlink($cache_file) if $cache_delete;
				655
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	656	print "Done.\n";
				657	print timestr(timediff(Benchmark->new, $t))."\n\n";
				658	}
				659
				660	# Unknown command
				661	else {
				662	warn "Unknown command '$cmd'.\n\n";
				663	pod2usage(%ERROR_HASH);
				664	}
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	665
				666	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	667
				668	=pod
				669
				670	=encoding utf8
				671
				672	=head1 NAME
				673
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	674	korapxml2krill - Merge KorapXML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	675
				676
				677	=head1 SYNOPSIS
				678
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	679	korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	680
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	681
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	682	=head1 DESCRIPTION
				683
				684	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				685	compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	686	The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	687
				688
				689	=head1 INSTALLATION
				690
				691	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
				692
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	693	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	694
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	695	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	696	be available on your command line immediately.
Akron	7438151	2016-10-14 11:56:22 +0200	[diff] [blame]	697	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	698	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	699
				700	=head1 ARGUMENTS
				701
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	702	$ korapxml2krill -z --input <directory> --output <filename>
				703
				704	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	705	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	706
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	707	=over 2
				708
				709	=item B<archive>
				710
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	711	$ korapxml2krill archive -z --input <directory|archive> --output <directory>
				712
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	713	Converts an archive of KorAP-XML documents. It expects a directory
				714	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	715
				716	=item B<extract>
				717
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	718	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				719
				720	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	721
				722	=back
				723
				724
				725	=head1 OPTIONS
				726
				727	=over 2
				728
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	729	=item B<--input\|-i> <directory\|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	730
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	731	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	732
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	733	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	734	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				735	file to batch process multiple files.
				736	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	737
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	738	C<archive> supports multiple input zip files with the constraint
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	739	that the first archive listed contains all primary data files
				740	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	741
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	742	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	743
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	744	Input may also be defined using BSD glob wildcards.
				745
				746	-i 'file/news*.zip'
				747
				748	The extended input array will be sorted in length order, so the shortest
				749	path needs to contain all primary data files and all meta data files.
				750
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	751	(The directory structure follows the base directory format,
				752	that may include a C<.> root folder.
				753	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	754	need to be passed with a hash sign in front of the archive's name.
				755	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	756
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	757	To support zip files, a version of C<unzip> needs to be installed that is
				758	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	759
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	760	B<The root folder switch using the hash sign is experimental and
				761	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	762
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	763	=item B<--output\|-o> <directory\|file>
				764
				765	Output folder for archive processing or
				766	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	767	writes to C<STDOUT> by default
				768	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	769
				770	=item B<--overwrite\|-w>
				771
				772	Overwrite files that already exist.
				773
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	774	=item B<--token\|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	775
				776	Define the default tokenization by specifying
				777	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	778	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	779
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	780
				781	=item B<--base-sentences\|-bs> <foundry>#<layer>
				782
				783	Define the layer for base sentences.
				784	If given, this will be used instead of using C<Base#Sentences>.
				785	Currently C<DeReKo#Structure> is the only additional layer supported.
				786
				787	Defaults to unset.
				788
				789
				790	=item B<--base-paragraphs\|-bp> <foundry>#<layer>
				791
				792	Define the layer for base paragraphs.
				793	If given, this will be used instead of using C<Base#Paragraphs>.
				794	Currently C<DeReKo#Structure> is the only additional layer supported.
				795
				796	Defaults to unset.
				797
				798
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	799	=item B<--base-pagebreaks\|-bpb> <foundry>#<layer>
				800
				801	Define the layer for base pagebreaks.
				802	Currently C<DeReKo#Structure> is the only layer supported.
				803
				804	Defaults to unset.
				805
				806
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	807	=item B<--skip\|-s> <foundry>[#<layer>]
				808
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	809	Skip specific annotations by specifying the foundry
				810	(and optionally the layer with a C<#>-prefix),
				811	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	812	Can be set multiple times.
				813
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	814	=item B<--anno\|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	815
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	816	Convert specific annotations by specifying the foundry
				817	(and optionally the layer with a C<#>-prefix),
				818	e.g. C<Mate> or C<Mate#Morpho>.
				819	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	820
				821	=item B<--primary\|-p>
				822
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	823	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	824	Can be flagged using C<--no-primary> as well.
				825	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	826
				827	=item B<--jobs\|-j>
				828
				829	Define the number of concurrent jobs in separate forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	830	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	831	Defaults to C<0> (everything runs in a single process).
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	832	Pass -1, and the value will be set automatically to 5
				833	times the number of available cores.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	834	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	835
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	836	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	837
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	838	Define the metadata parser to use. Defaults to C<I5>.
				839	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				840	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	841
				842	=item B<--pretty\|-y>
				843
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	844	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	845	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	846
				847	=item B<--gzip\|-z>
				848
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	849	Compress the output.
				850	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	851
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	852	=item B<--cache\|-c>
				853
				854	File to mmap a cache (using L<Cache::FastMmap>).
				855	Defaults to C<korapxml2krill.cache> in the calling directory.
				856
				857	=item B<--cache-size\|-cs>
				858
				859	Size of the cache. Defaults to C<50m>.
				860
				861	=item B<--cache-init\|-ci>
				862
				863	Initialize cache file.
				864	Can be flagged using C<--no-cache-init> as well.
				865	Defaults to C<true>.
				866
				867	=item B<--cache-delete\|-cd>
				868
				869	Delete cache file after processing.
				870	Can be flagged using C<--no-cache-delete> as well.
				871	Defaults to C<true>.
				872
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	873	=item B<--sigle\|-sg>
				874
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	875	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	876	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	877	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	878	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	879	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	880	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	881
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	882	=item B<--log\|-l>
				883
				884	The L<Log::Log4perl> log level, defaults to C<ERROR>.
				885
				886	=item B<--help\|-h>
				887
				888	Print this document.
				889
				890	=item B<--version\|-v>
				891
				892	Print version information.
				893
				894	=back
				895
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	896	=head1 ANNOTATION SUPPORT
				897
				898	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				899	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				900	The base foundry with paragraphs, sentences, and the text element is mandatory for
				901	L<Krill|https://github.com/KorAP/Krill>.
				902
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	903	Base
				904	#Paragraphs
				905	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	906
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	907	Connexor
				908	#Morpho
				909	#Phrase
				910	#Sentences
				911	#Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	912
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	913	CoreNLP
				914	#Constituency
				915	#Morpho
				916	#NamedEntities
				917	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	918
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	919	DeReKo
				920	#Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	921
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	922	DRuKoLa
				923	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	924
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	925	Glemm
				926	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	927
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	928	Malt
				929	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	930
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	931	MarMoT
				932	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	933
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	934	Mate
				935	#Dependency
				936	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	937
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	938	MDParser
				939	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	940
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	941	OpenNLP
				942	#Morpho
				943	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	944
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	945	Sgbr
				946	#Lemma
				947	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	948
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	949	TreeTagger
				950	#Morpho
				951	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	952
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame^]	953	XIP
				954	#Constituency
				955	#Morpho
				956	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	957
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	958
				959	More importers are in preparation.
				960	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				961	See the built-in annotation importers as examples.
				962
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	963	=head1 AVAILABILITY
				964
				965	https://github.com/KorAP/KorAP-XML-Krill
				966
				967
				968	=head1 COPYRIGHT AND LICENSE
				969
Akron	3ec0a1c	2017-01-18 14:41:55 +0100	[diff] [blame]	970	Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	971
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	972	Author: L<Nils Diewald|http://nils-diewald.de/>
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	973	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	974
				975	L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
				976	Corpus Analysis Platform at the
				977	L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
				978	member of the
				979	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
				980
				981	This program is free software published under the
				982	L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				983
				984	=cut