Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: b61ae04b3f41f203253a85cf57ac319daeb68d06 [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	10	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	11	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	12	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	13	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	14	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	15	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	16	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	17	use KorAP::XML::Batch::File;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	18	use Parallel::ForkManager;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	19	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	20	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	21
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	22	# CHANGES:
				23	# ----------------------------------------------------------
				24	# 2013/11/25
				25	# - Initial release
				26	#
				27	# 2014/10/29
				28	# - Merges foundry data to create indexer friendly documents
				29	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	30	# 2016/02/04
				31	# - renamed to korapxml2krill
				32	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	33	#
				34	# 2016/02/12
				35	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	36	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	37	#
				38	# 2016/02/14
				39	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	40	# - Added support for archive files
				41	#
				42	# 2016/02/15
				43	# - Fixed temporary directory bug
				44	# - Improved skipping before unzipping
				45	# - Added EXPERIMENTAL concurrency support
				46	#
				47	# 2016/02/23
				48	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	49	#
				50	# 2016/02/27
				51	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	52	#
				53	# 2016/03/17
				54	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	55	#
				56	# 2016/03/18
				57	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	58	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	59	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	60	# - Added multi archive support
				61	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	62	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	63	#
				64	# 2016/07/06
				65	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	66	#
				67	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	68	# - Fixed temporary path issue in script
				69	#
				70	# 2016/10/24
				71	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	72	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	73	# 2016/10/24
				74	# - Added support for document extraction
				75	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	76	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	77	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	78	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	79	# 2016/12/21
				80	# - added support for base-sentences and base-tokenizations
				81	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	82	# 2017/01/20
				83	# - added support for DRuKoLa annotations
				84	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	85	# 2017/02/08
				86	# - added support for pagebreak annotations
				87	#
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	88	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	89
# Date of the last entry in the change log above
our $LAST_CHANGE = '2017/02/08';

# Directory the script is located in
our $LOCAL = $FindBin::Bin;

# Single-line banner shown by --help, --version and on usage errors
our $VERSION_MSG =
  'Version ' . $KorAP::XML::Krill::VERSION .
  ' - diewald@ids-mannheim.de - ' . $LAST_CHANGE . "\n";
				95
# Parse command: a leading argument that does not start with a dash
# is consumed as the command; otherwise no command is set
my $cmd;
our @ARGV;
$cmd = shift @ARGV if $ARGV[0] && index($ARGV[0], '-') != 0;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	102
# Options that may be passed multiple times
my (@skip, @sigle, @anno, @input);
my $text;

# Options without a default value
my ($output, $overwrite, $meta, $gzip, $primary, $pretty);

# Options with a default value
my $token_base      = 'OpenNLP#tokens';
my $base_sentences  = '';
my $base_paragraphs = '';
my $base_pagebreaks = '';
my $cache_file      = 'korapxml2krill.cache';
my $log_level       = 'ERROR';
my $jobs            = 0;
my $cache_size      = '50m';
my $cache_delete    = 1;
my $cache_init      = 1;

# Parse options from the command line
GetOptions(
  'input|i=s'              => \@input,
  'output|o=s'             => \$output,
  'overwrite|w'            => \$overwrite,
  'meta|m=s'               => \$meta,
  'token|t=s'              => \$token_base,
  'base-sentences|bs=s'    => \$base_sentences,
  'base-paragraphs|bp=s'   => \$base_paragraphs,
  'base-pagebreaks|bpb=s'  => \$base_pagebreaks,
  'gzip|z'                 => \$gzip,
  'skip|s=s'               => \@skip,
  'sigle|sg=s'             => \@sigle,
  'cache|c=s'              => \$cache_file,
  'log|l=s'                => \$log_level,
  'anno|a=s'               => \@anno,
  'primary|p!'             => \$primary,
  'pretty|y'               => \$pretty,
  'jobs|j=i'               => \$jobs,
  'cache-size|cs=s'        => \$cache_size,
  'cache-delete|cd!'       => \$cache_delete,
  'cache-init|ci!'         => \$cache_init,

  # Print the selected POD sections together with the version banner
  'help|h' => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version banner (with the terse usage of verbosity 0)
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
				144
# Base annotation names are compared in lower case later on
$_ = lc($_) for ($base_sentences, $base_paragraphs, $base_pagebreaks);

# Parameters for pod2usage in case of a usage error (exit status 1)
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
@input or pod2usage(%ERROR_HASH);

# Gzip has no effect, if no output is given
if ($gzip && !$output) {
  pod2usage(%ERROR_HASH);
};
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	162
# Log4perl configuration: report to STDERR on the requested level
my %log_conf = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);

# Initialize log4perl object
Log::Log4perl->init(\%log_conf);

my $log = Log::Log4perl->get_logger('main');
				172
# Lookup of everything passed via --skip, keyed in lower case
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotations as [Foundry, Layer] or
# [Foundry, Layer, Parameter]
my @layers;

# The default base layers are only used,
# if no alternative base was requested
push @layers, ['Base', 'Sentences']  unless $base_sentences;
push @layers, ['Base', 'Paragraphs'] unless $base_paragraphs;

push @layers, (
  # Connexor
  ['Connexor', 'Morpho'],
  ['Connexor', 'Syntax'],
  ['Connexor', 'Phrase'],
  ['Connexor', 'Sentences'],

  # CoreNLP
  ['CoreNLP', 'NamedEntities'],
  ['CoreNLP', 'Sentences'],
  ['CoreNLP', 'Morpho'],
  ['CoreNLP', 'Constituency']
);

# DeReKo - collect the base annotations that should be
# taken from the DeReKo structure layer
my %dereko_base = (
  sentences  => $base_sentences,
  paragraphs => $base_paragraphs,
  pagebreaks => $base_pagebreaks
);
my @dereko_attr = grep {
  $dereko_base{$_} eq 'dereko#structure'
} qw/sentences paragraphs pagebreaks/;

push @layers, (
  @dereko_attr
    ? ['DeReKo', 'Structure', 'base-' . join('-', @dereko_attr)]
    : ['DeReKo', 'Structure']
);

push @layers, (
  # Glemm
  ['Glemm', 'Morpho'],

  # Malt
  ['Malt', 'Dependency'],

  # MDParser
  ['MDParser', 'Dependency'],

  # Mate
  ['Mate', 'Morpho'],
  ['Mate', 'Dependency'],

  # OpenNLP
  ['OpenNLP', 'Morpho'],
  ['OpenNLP', 'Sentences'],

  # Schreibgebrauch
  ['Sgbr', 'Lemma'],
  ['Sgbr', 'Morpho'],

  # TreeTagger
  ['TreeTagger', 'Morpho'],
  ['TreeTagger', 'Sentences'],

  # XIP
  ['XIP', 'Morpho'],
  ['XIP', 'Constituency'],
  ['XIP', 'Sentences'],
  ['XIP', 'Dependency'],

  # DRuKoLa
  ['DRuKoLa', 'Morpho'],

  # Marmot
  ['MarMoT', 'Morpho']
);
				249
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	250
# Check filters
my @filtered_anno;

# Everything is skipped - only use the annotations given via --anno,
# each split into its Foundry#Layer parts
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Use all known annotations, but drop those whose
# Foundry or Foundry#Layer should be skipped
else {
  @filtered_anno = grep {
    my $foundry = lc($_->[0]);
    !($skip{$foundry} || $skip{$foundry . '#' . lc($_->[1])});
  } @layers;
};
				269
				270	# Get tokenization basis
				271	my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if ($token_base);
				272
				273	# TODO: This should not be initialized for batch
				274	my $cache = Cache::FastMmap->new(
				275	share_file => $cache_file,
				276	cache_size => $cache_size,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	277	init_file => $cache_init
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	278	);
				279
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	280	# Create batch object
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	281	my $batch_file = KorAP::XML::Batch::File->new(
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	282	cache => $cache,
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	283	meta_type => $meta,
				284	overwrite => $overwrite,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	285	foundry => $token_base_foundry,
				286	layer => $token_base_layer,
				287	gzip => $gzip,
				288	log => $log,
				289	primary => $primary,
				290	pretty => $pretty,
				291	anno => \@filtered_anno
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	292	);
				293
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	294
				295	# Get file name based on path information
				296	sub get_file_name ($) {
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	297	my $i = $input[0];
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	298	if (-d $i) {
				299	$i =~ s![^\/]+$!!;
				300	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	301	my $file = shift;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	302
				303	# Remove temp dir fragments
Akron	6255760	2016-06-27 14:10:13 +0200	[diff] [blame]	304	$file =~ s!^/?tmp/[^/]+!!;
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	305	$file =~ s/^?\/?$i//;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	306	$file =~ tr/\//-/;
				307	$file =~ s{^-+}{};
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	308	$file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	309	return $file;
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	310	};
				311
# Convert sigle to path construct, e.g. 'CORPUS_DOC.TEXT' to
# 'CORPUS/DOC/TEXT'. Surrounding whitespace is optional and removed.
s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
				314
# All commands write to an output directory, which has to exist.
# A missing directory is an error, so exit with a non-zero status.
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";
  exit(1);
};
				321
				322
# No command given: process a single text directory
if (!$cmd) {

  # Remember the start of the script as early as possible
  BEGIN {
    $main::TIME      = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time passed since the last stop and since the start
  sub stop_time {
    my $now     = Benchmark->new;
    my $lap     = timestr(timediff($now, $main::LAST_STOP));
    my $overall = timestr(timediff($now, $main::TIME));
    $log->info('The code took: ' . $lap . ' (overall: ' . $overall . ')');
    $main::LAST_STOP = $now;
  };

  # The document path requires a trailing slash
  my $input = $input[0];
  $input =~ s!([^/])$!$1/!;

  # Create, parse and convert the document
  $batch_file->process($input, $output);

  # Delete cache file
  if ($cache_delete) {
    unlink($cache_file);
  };

  stop_time();
}
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	353
# Extract XML files from one or more archives.
# Exits with status 0 after the extraction was attempted for all sigles,
# and with status 1 if the archive or unzip is not usable.
elsif ($cmd eq 'extract') {

  # Create new archive object for the primary archive
  my $archive = -f($input[0]) ? KorAP::XML::Archive->new($input[0]) : undef;

  # Can't create archive object
  unless ($archive) {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };

  # Check zip capabilities
  unless ($archive->test_unzip) {
    print "Unzip is not installed or incompatible.\n\n";
    exit(1);
  };

  # Add further annotation archives
  $archive->attach($_) foreach @input[1..$#input];

  # Paths are prefixed with './' if this is true
  my $prefix = 1;

  # No sigles given - extract all texts of the archive
  unless (@sigle) {

    # Get files
    foreach ($archive->list_texts) {

      # Split path information
      ($prefix, my ($corpus, $doc, $text)) = $archive->split_path($_);

      # TODO: Make this OS independent
      push @sigle, join '/', $corpus, $doc, $text;
    };
  }

  # Check sigle for doc sigles
  else {
    my @new_sigle;

    # Check only once, if a prefix is needed
    my $prefix_check = 0;
    my $check_prefix = sub {
      return if $prefix_check;
      if ($prefix = $archive->check_prefix) {
        print " with prefix ...";
      };
      $prefix_check = 1;
    };

    # Iterate over all sigle
    foreach (@sigle) {

      # Sigle is a doc sigle (corpus and document only)
      if ($_ =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

        print "$_ ...";
        $check_prefix->();
        print "\n";

        # TODO: Make this OS independent
        my $path = ($prefix ? './' : '') . $_;

        # Documents are extracted immediately
        print '... ' . (
          $archive->extract_doc(
            $path, $output, $jobs
          ) ? '' : 'not '
        );
        print "extracted.\n";
      }

      # Sigle is a text sigle - extract later
      else {
        push @new_sigle, $_;
        $check_prefix->();
      };
    };
    @sigle = @new_sigle;
  };

  # Iterate over all given text sigles and extract
  foreach (@sigle) {

    print "$_ ...\n";

    # TODO: Make this OS independent
    print '... ' . (
      $archive->extract_text(
        ($prefix ? './' : '') . $_, $output
      ) ? '' : 'not '
    );
    print "extracted.\n";
  };

  print "\n";

  # Extraction finished
  exit(0);
}
				460
# Process an archive, i.e. convert all texts of a corpus directory
# or of one or more zip files into the output directory
elsif ($cmd eq 'archive') {

  # TODO: Support sigles

  # Temporary directories are created using File::Temp, which is
  # not loaded by this script otherwise
  require File::Temp;

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter  = 1; # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      my ($pid, $code) = @_;

      # The data passed to finish() is the last parameter:
      # [message, temporary directory (optional)]
      my $data = pop;

      print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . $data->[0] . "\n";

      # Release the temporary directory
      $data->[1] = undef if $data->[1];
    }
  );

  # Start time of the processing - initialized here, so the final
  # report works, even if the input can't be processed at all
  my $t = Benchmark->new;
  my $temp;
  print "Reading data ...\n";

  # unless (Cache::FastMmap->new(
  #   share_file => $cache_file,
  #   cache_size => $cache_size,
  #   init_file => $cache_init
  # )) {
  #   print "Unable to intialize cache '$cache_file'\n\n";
  #   exit(1);
  # };

  # Input is a directory
  if (-d $input[0]) {
    my $it = Directory::Iterator->new($input[0]);
    my @dirs;
    my $dir;

    # Collect all directories containing a data.xml file
    # Todo: Make a DO WHILE
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      my $filename = catfile(
        $output,
        get_file_name($text_dir) . '.json' . ($gzip ? '.gz' : '')
      );

      # Get the next fork
      $pool->start and next DIRECTORY_LOOP;

      # A return value of -1 means, the file was not overwritten
      if (my $return = $batch_file->process($text_dir => $filename)) {
        $pool->finish(
          0,
          ["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
        );
      }
      else {
        $pool->finish(1, ["Unable to process " . $text_dir]);
      };
    };
  }

  # Input is a file
  elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    # Add further annotation archives
    $archive->attach($_) foreach @input[1..$#input];

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $text_path (@dirs) {

      # Split path information
      my ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

      my $filename = catfile(
        $output,
        get_file_name(
          catfile($corpus, $doc, $text)
            . '.json' . ($gzip ? '.gz' : '')
          )
      );

      # Get the next fork
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary directory
      $temp = File::Temp->newdir;

      # TODO: Check if $filename exist at the beginning,
      # because extraction can be horribly slow!

      # Extract from archive
      if ($archive->extract_text($text_path, $temp)) {

        # Create corpus directory
        my $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc, $text);

        # Write file - the temporary directory is passed along,
        # so it can be released on finish
        if (my $return = $batch_file->process($dir => $filename)) {
          $pool->finish(
            0,
            ["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
          );
        }
        else {
          $pool->finish(1, ["Unable to process " . $dir, $temp]);
        };
      }

      # Unable to extract
      else {
        $pool->finish(1, ["Unable to extract " . $text_path, $temp]);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";
  };

  $pool->wait_all_children;

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	629
				630	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	631
				632	=pod
				633
				634	=encoding utf8
				635
				636	=head1 NAME
				637
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	638	korapxml2krill - Merge KorapXML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	639
				640
				641	=head1 SYNOPSIS
				642
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	643	korapxml2krill [archive\|extract] --input <directory\|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	644
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	645
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	646	=head1 DESCRIPTION
				647
				648	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				649	compatible with the L<Krill\|https://github.com/KorAP/Krill> indexer.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	650	The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	651
				652
				653	=head1 INSTALLATION
				654
				655	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm\|App::cpanminus>.
				656
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	657	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	658
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	659	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	660	be available on your command line immediately.
Akron	7438151	2016-10-14 11:56:22 +0200	[diff] [blame]	661	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	662	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	663
				664	=head1 ARGUMENTS
				665
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	666	$ korapxml2krill -z --input <directory> --output <filename>
				667
				668	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	669	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	670
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	671	=over 2
				672
				673	=item B<archive>
				674
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	675	$ korapxml2krill archive -z --input <directory\|archive> --output <directory>
				676
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	677	Converts an archive of KorAP-XML documents. It expects a directory
				678	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	679
				680	=item B<extract>
				681
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	682	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				683
				684	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	685
				686	=back
				687
				688
				689	=head1 OPTIONS
				690
				691	=over 2
				692
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	693	=item B<--input\|-i> <directory\|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	694
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	695	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	696
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	697	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	698	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				699	file to batch process multiple files.
				700	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	701
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	702	C<archive> supports multiple input zip files with the constraint
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	703	that the first archive listed contains all primary data files
				704	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	705
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	706	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	707
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	708	(The directory structure follows the base directory format,
				709	that may include a C<.> root folder.
				710	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	711	need to be passed with a hash sign in front of the archive's name.
				712	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	713
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	714	To support zip files, a version of C<unzip> needs to be installed that is
				715	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	716
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	717	B<The root folder switch using the hash sign is experimental and
				718	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	719
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	720	=item B<--output\|-o> <directory\|file>
				721
				722	Output folder for archive processing or
				723	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	724	writes to C<STDOUT> by default
				725	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	726
				727	=item B<--overwrite\|-w>
				728
				729	Overwrite files that already exist.
				730
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	731	=item B<--token\|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	732
				733	Define the default tokenization by specifying
				734	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	735	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	736
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	737
				738	=item B<--base-sentences\|-bs> <foundry>#<layer>
				739
				740	Define the layer for base sentences.
				741	If given, this will be used instead of using C<Base#Sentences>.
				742	Currently C<DeReKo#Structure> is the only additional layer supported.
				743
				744	Defaults to unset.
				745
				746
				747	=item B<--base-paragraphs\|-bp> <foundry>#<layer>
				748
				749	Define the layer for base paragraphs.
				750	If given, this will be used instead of using C<Base#Paragraphs>.
				751	Currently C<DeReKo#Structure> is the only additional layer supported.
				752
				753	Defaults to unset.
				754
				755
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	756	=item B<--base-pagebreaks\|-bpb> <foundry>#<layer>
				757
				758	Define the layer for base pagebreaks.
				759	Currently C<DeReKo#Structure> is the only layer supported.
				760
				761	Defaults to unset.
				762
				763
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	764	=item B<--skip\|-s> <foundry>[#<layer>]
				765
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	766	Skip specific annotations by specifying the foundry
				767	(and optionally the layer with a C<#>-prefix),
				768	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	769	Can be set multiple times.
				770
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	771	=item B<--anno\|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	772
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	773	Convert specific annotations by specifying the foundry
				774	(and optionally the layer with a C<#>-prefix),
				775	e.g. C<Mate> or C<Mate#Morpho>.
				776	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	777
				778	=item B<--primary\|-p>
				779
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	780	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	781	Can be flagged using C<--no-primary> as well.
				782	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	783
				784	=item B<--jobs\|-j>
				785
				786	Define the number of concurrent jobs in separate forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	787	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	788	Defaults to C<0> (everything runs in a single process).
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	789	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	790
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	791	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	792
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	793	Define the metadata parser to use. Defaults to C<I5>.
				794	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				795	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	796
				797	=item B<--pretty\|-y>
				798
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	799	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	800	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	801
				802	=item B<--gzip\|-z>
				803
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	804	Compress the output.
				805	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	806
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	807	=item B<--cache\|-c>
				808
				809	File to mmap a cache (using L<Cache::FastMmap>).
				810	Defaults to C<korapxml2krill.cache> in the calling directory.
				811
				812	=item B<--cache-size\|-cs>
				813
				814	Size of the cache. Defaults to C<50m>.
				815
				816	=item B<--cache-init\|-ci>
				817
				818	Initialize cache file.
				819	Can be flagged using C<--no-cache-init> as well.
				820	Defaults to C<true>.
				821
				822	=item B<--cache-delete\|-cd>
				823
				824	Delete cache file after processing.
				825	Can be flagged using C<--no-cache-delete> as well.
				826	Defaults to C<true>.
				827
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	828	=item B<--sigle\|-sg>
				829
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	830	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	831	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	832	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	833	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	834	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	835	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	836
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	837	=item B<--log\|-l>
				838
				839	The L<Log::Log4perl> log level, defaults to C<ERROR>.
				840
				841	=item B<--help\|-h>
				842
				843	Print this document.
				844
				845	=item B<--version\|-v>
				846
				847	Print version information.
				848
				849	=back
				850
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	851	=head1 ANNOTATION SUPPORT
				852
				853	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				854	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				855	The base foundry with paragraphs, sentences, and the text element is mandatory for
				856	L<Krill\|https://github.com/KorAP/Krill>.
				857
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	858	=over 2
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	859
				860	=item B<Base>
				861
				862	=over 4
				863
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	864	=item #Paragraphs
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	865
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	866	=item #Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	867
				868	=back
				869
				870	=item B<Connexor>
				871
				872	=over 4
				873
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	874	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	875
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	876	=item #Phrase
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	877
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	878	=item #Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	879
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	880	=item #Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	881
				882	=back
				883
				884	=item B<CoreNLP>
				885
				886	=over 4
				887
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	888	=item #Constituency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	889
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	890	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	891
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	892	=item #NamedEntities
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	893
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	894	=item #Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	895
				896	=back
				897
				898	=item B<DeReKo>
				899
				900	=over 4
				901
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	902	=item #Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	903
				904	=back
				905
				906	=item B<Glemm>
				907
				908	=over 4
				909
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	910	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	911
				912	=back
				913
				914	=item B<Mate>
				915
				916	=over 4
				917
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	918	=item #Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	919
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	920	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	921
				922	=back
				923
				924	=item B<OpenNLP>
				925
				926	=over 4
				927
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	928	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	929
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	930	=item #Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	931
				932	=back
				933
				934	=item B<Sgbr>
				935
				936	=over 4
				937
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	938	=item #Lemma
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	939
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	940	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	941
				942	=back
				943
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	944	=item B<DRuKoLa>
				945
				946	=over 4
				947
				948	=item #Morpho
				949
				950	=back
				951
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	952	=item B<TreeTagger>
				953
				954	=over 4
				955
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	956	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	957
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	958	=item #Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	959
				960	=back
				961
				962	=item B<XIP>
				963
				964	=over 4
				965
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	966	=item #Constituency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	967
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	968	=item #Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	969
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	970	=item #Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	971
				972	=back
				973
				974	=back
				975
				976	More importers are in preparation.
				977	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				978	See the built-in annotation importers as examples.
				979
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	980	=head1 AVAILABILITY
				981
				982	https://github.com/KorAP/KorAP-XML-Krill
				983
				984
				985	=head1 COPYRIGHT AND LICENSE
				986
Akron	3ec0a1c	2017-01-18 14:41:55 +0100	[diff] [blame]	987	Copyright (C) 2015-2017, L<IDS Mannheim\|http://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	988
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	989	Author: L<Nils Diewald\|http://nils-diewald.de/>
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	990	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	991
				992	L<KorAP::XML::Krill> is developed as part of the L<KorAP\|http://korap.ids-mannheim.de/>
				993	Corpus Analysis Platform at the
				994	L<Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				995	member of the
				996	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
				997
				998	This program is free software published under the
				999	L<BSD-2 License\|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				1000
				1001	=cut