Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: 289c2f468348e2782bd54e7430906185b9d7334e [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	10	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	11	use Pod::Usage;
				12	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	13	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	14	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	15	use KorAP::XML::Tokenizer;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	16	use Parallel::ForkManager;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	18	# CHANGES:
				19	# ----------------------------------------------------------
				20	# 2013/11/25
				21	# - Initial release
				22	#
				23	# 2014/10/29
				24	# - Merges foundry data to create indexer friendly documents
				25	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	26	# 2016/02/04
				27	# - renamed to korapxml2krill
				28	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	29	#
				30	# 2016/02/12
				31	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	32	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	33	#
				34	# 2016/02/14
				35	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	36	# - Added support for archive files
				37	#
				38	# 2016/02/15
				39	# - Fixed temporary directory bug
				40	# - Improved skipping before unzipping
				41	# - Added EXPERIMENTAL concurrency support
				42	#
				43	# 2016/02/23
				44	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame^]	45	#
				46	# 2016/02/27
				47	# - Added extract function
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	48	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	49
# Date of the most recent entry in the CHANGES list above
our $LAST_CHANGE = '2016/02/27';

# Directory this script lives in
our $LOCAL = $FindBin::Bin;

# Banner printed together with usage, help and version output
our $VERSION_MSG = 'Version ' . $KorAP::XML::Krill::VERSION .
  ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";


# Parse command: a leading argument that does not start with a hyphen
# is taken as the command name and removed from the argument list
our @ARGV;
my $cmd;
$cmd = shift @ARGV if $ARGV[0] && $ARGV[0] !~ /^-/;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	63
# Targets for options that can be given multiple times.
# They are declared up front and passed as \@array, because \(my @array)
# yields references to the elements of the array (an empty list for a
# fresh array) instead of a reference to the array itself.
my (@skip, @sigle, @allow);

# Usage configuration for all error exits
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Parse options from the command line.
# Unknown or malformed options print the usage and exit with an error.
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'human|m'     => \(my $text),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \@skip,
  'sigle|sg=s'  => \@sigle,
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'allow|a=s'   => \@allow,
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
) or pod2usage(%ERROR_HASH);

# Input has to be defined
pod2usage(%ERROR_HASH) unless $input;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	105
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	106
# Initialize log4perl object:
# everything is logged to STDERR with colored levels,
# the threshold is the (uppercased) --log level
my %log_config = (
  'log4perl.rootLogger'              => join(', ', uc($log_level), 'STDERR'),
  'log4perl.appender.STDERR'         => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'  => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_config);

# Logger shared by all parts of the script
my $log = Log::Log4perl->get_logger('main');
				116
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	117
				118	# Get file name based on path information
				119	sub get_file_name ($) {
				120	my $file = shift;
				121	$file =~ s/^?\/?$input//;
				122	$file =~ tr/\//-/;
				123	$file =~ s{^-+}{};
				124	return $file;
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	125	};
				126
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	127
				128	# Write file
				129	sub write_file {
				130	my $anno = shift;
				131	my $file = get_file_name $anno;
				132
				133	# TODO: This should be done directly with a data structure! KorAP::XML::Wrap
				134
				135	my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
				136	$anno . ' -o ' . $output . '/' . $file . '.json';
				137	$call .= '.gz -z' if $gzip;
				138	$call .= ' -m' if $text;
				139	$call .= ' -w' if $overwrite;
				140	$call .= ' -t ' . $token_base if $token_base;
				141	$call .= ' -l ' . $log_level if $log_level;
				142	$call .= ' --no-primary ' if $primary;
				143	$call .= ' -y ' . $pretty if $pretty;
				144	$call .= ' -a ' . $_ foreach @allow;
				145	$call .= ' -s ' . $_ foreach @skip;
				146	system($call);
				147	return "$file";
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	148	};
				149
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	150
# Convert sigle to path construct, e.g. 'CORPUS_DOC.TEXT' to 'CORPUS/DOC/TEXT'.
# Surrounding whitespace is optional and removed;
# values not matching the pattern are left untouched
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
				153
# Process a single file:
# parse the document in $input, add all annotation layers that are not
# skipped and write the result to $output (or print it to STDOUT)
unless ($cmd) {

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lookup table of lowercased --skip values
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;


  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Start the clocks at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (on trace level) the time passed since the last call
  # and since the start of the script, then restart the lap clock
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document - the path requires a trailing slash
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new( path => $input );

  # Name of the document in log messages:
  # $output is not defined when the result is printed to STDOUT
  my $target = defined $output ? $output : $input;

  unless ($doc->parse) {
    $log->warn($target . " can't be processed - no document data");
    exit(0);
  };

  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path    => $doc->path,
    doc     => $doc,
    foundry => $token_base_foundry,
    layer   => $token_base_layer,
    name    => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($target . " can't be processed - no base tokenization");
    exit(0);
  };

  # All supported layers as [foundry, layer] in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped - add the explicitly allowed layers only
  if ($skip{'#all'}) {
    foreach (@allow) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my $foundry = lc($info->[0]);
      my $layer   = lc($info->[1]);

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  my $print_text = $text ? $tokens->to_string($primary) :
    ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {

    # IO::File is not loaded at the top of the file
    require IO::File;

    my $file = $gzip ?
      IO::Compress::Gzip->new($output, Minimal => 1) :
      IO::File->new($output, "w");

    # Both constructors return undef on failure
    unless ($file) {
      $log->error(
        "Unable to open $output for writing: " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  stop_time();
}
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	303
# Extract XML files:
# unpack the texts given by --sigle from the archive in $input
# to the directory in $output.
# Exits with 0 if all texts were extracted and with 1 otherwise.
elsif ($cmd eq 'extract') {

  pod2usage(%ERROR_HASH) unless $output;

  # TODO: Support sigles and full archives

  # -d is false for non-existing paths as well
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  my $archive = -f($input) ? KorAP::XML::Archive->new($input) : undef;

  # Report invalid input instead of silently doing nothing
  unless ($archive) {
    print "Input is not an archive.\n\n";
    exit(1);
  };

  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Test will be skipped

  # Iterate over all given sigles and extract
  my $failed = 0;
  foreach (@sigle) {
    print "$_ ";
    if ($archive->extract('./'. $_, $output)) {
      print "extracted.\n";
    }
    else {
      $failed++;
      print "not extracted.\n";
    };
  };

  print "\n";
  exit($failed ? 1 : 0);
}
				336
# Process an archive:
# convert all texts below the directory or inside the Zip archive in
# $input to single files in the directory in $output. Every text is
# converted in its own job (see --jobs); already existing files are
# skipped unless --overwrite is set.
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  pod2usage(%ERROR_HASH) unless $output;

  # -d is false for non-existing paths as well
  unless (-d $output) {
    print "Directory '$output' does not exist.\n\n";
    exit(1);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first
      # and the reference passed to finish() last
      my ($pid, $code) = @_;
      my $data = $_[-1];

      # No data is available if a job ended without calling finish()
      my $msg = ref $data ? $$data : '';

      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        " $msg\n";
    }
  );

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next DIRECTORY_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    # File::Temp is not loaded at the top of the file
    require File::Temp;

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc_id, $text_id) = $archive->split_path($text_path);

      unless ($overwrite) {
        my $filename = catfile(
          $output,
          get_file_name(catdir($doc_id, $text_id)) . '.json' . ($gzip ? '.gz' : '')
        );

        if (-e $filename) {
          $iter++;
          print "Skip $filename\n";
          next ARCHIVE_LOOP;
        };
      };

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($text_path, $temp)) {

        # The corpus directory becomes the input path,
        # so get_file_name() strips it from the text directory
        $input = catdir("$temp", $corpus);

        # Temporary directory of the text
        my $dir = catdir($input, $doc_id, $text_id);

        # Write file
        $msg = write_file($dir);

        # Release the temporary directory before the job is finished
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $text_path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  # Nothing to process and no start time to report on
  else {
    print "Input is neither a directory nor an archive.\n\n";
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}
				490
				491	# Unknown command
				492	else {
				493	warn "Unknown command '$cmd'.\n\n";
				494	pod2usage(%ERROR_HASH);
				495	}
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	496
				497	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	498
				499	=pod
				500
				501	=encoding utf8
				502
				503	=head1 NAME
				504
				505	korapxml2krill - Merge KorapXML data and create Krill friendly documents
				506
				507
				508	=head1 SYNOPSIS
				509
				510	$ korapxml2krill [archive] -z --input <directory> --output <filename>
				511
				512
				513	=head1 DESCRIPTION
				514
				515	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				516	compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
				517
				518
				519	=head1 INSTALLATION
				520
				521	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
				522
				523	$ cpanm https://github.com/KorAP/KorAP-XML-Krill
				524
				525	In case everything went well, the C<korapxml2krill> command line tool will
				526	be available.
				527
				528
				529	=head1 ARGUMENTS
				530
				531	=over 2
				532
				533	=item B<archive>
				534
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame^]	535	Process an archive as a Zip-file or a folder of KorAP-XML documents.
				536
				537	=item B<extract>
				538
				539	Extract KorAP-XML files from a Zip-file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	540
				541	=back
				542
				543
				544	=head1 OPTIONS
				545
				546	=over 2
				547
				548	=item B<--input|-i> <directory|file>
				549
				550	Directory or archive file of documents to index.
				551
				552	=item B<--output|-o> <directory|file>
				553
				554	Output folder for archive processing or
				555	document name for single output (optional),
				556	writes to <STDOUT> by default.
				557
				558	=item B<--overwrite|-w>
				559
				560	Overwrite files that already exist.
				561
				562	=item B<--token|-t> <foundry>[#<file>]
				563
				564	Define the default tokenization by specifying
				565	the name of the foundry and optionally the name
				566	of the layer-file. Defaults to OpenNLP#tokens.
				567
				568	=item B<--skip|-s> <foundry>[#<layer>]
				569
				570	Skip specific foundries by specifying the name
				571	or specific layers by appending the layer name
				572	to the foundry name, separated by a #,
				573	e.g. Mate#Morpho. Alternatively you can skip #ALL.
				574	Can be set multiple times.
				575
				576	=item B<--allow|-a> <foundry>#<layer>
				577
				578	Allow specific foundries and layers by defining them
				579	combining the foundry name with a # and the layer name.
				580
				581	=item B<--primary|-p>
				582
				583	Output primary data or not. Defaults to true.
				584	Can be flagged using --no-primary as well.
				585
				586	=item B<--jobs|-j>
				587
				588	Define the number of concurrent jobs in separated forks
				589	for archive processing, defaults to 0. This is B<EXPERIMENTAL>!
				590
				591	=item B<--human|-m>
				592
				593	Represent the data human friendly, while the output defaults to JSON.
				594
				595	=item B<--pretty|-y>
				596
				597	Pretty print JSON output.
				598
				599	=item B<--gzip\|-z>
				600
				601	Compress the output (expects a defined output file in single processing).
				602
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame^]	603	=item B<--sigle\|-sg>
				604
				605	Extract the given text sigles.
				606	Currently only supported on C<extract>.
				607	Can be set multiple times.
				608
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	609	=item B<--log|-l>
				610
				611	The L<Log4perl> log level, defaults to C<ERROR>.
				612
				613	=item B<--help|-h>
				614
				615	Print this document.
				616
				617	=item B<--version|-v>
				618
				619	Print version information.
				620
				621	=back
				622
				623	=head1 AVAILABILITY
				624
				625	https://github.com/KorAP/KorAP-XML-Krill
				626
				627
				628	=head1 COPYRIGHT AND LICENSE
				629
				630	Copyright (C) 2015-2016, L<IDS Mannheim|http://www.ids-mannheim.de/>
				631	Author: L<Nils Diewald|http://nils-diewald.de/>
				632
				633	L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
				634	Corpus Analysis Platform at the
				635	L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
				636	member of the
				637	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
				638
				639	This program is free software published under the
				640	L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				641
				642	=cut