Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: 93e5eacec66b8748ecfa0d5dd981668f5209c1af [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	10	use POSIX qw/ceil/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	11	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	12	use Pod::Usage;
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	13	use Cache::FastMmap;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	14	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	15	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	16	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17	use KorAP::XML::Tokenizer;
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	18	use KorAP::XML::Batch::File;
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	19	use Config::Simple;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	20	use Parallel::ForkManager;
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	21	use v5.10;
				22	use Sys::Info;
				23	use Sys::Info::Constants qw( :device_cpu );
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	24	use File::Glob ':bsd_glob';
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	25	use File::Temp qw/tempdir/;
				26
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	27
				28	# use KorAP::XML::ForkPool;
Akron	75ba57d	2016-03-07 23:36:27 +0100	[diff] [blame]	29	# TODO: use Parallel::Loops
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	30	# TODO: make output files
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	31
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	32	# TODO: Use KorAP::XML::ForkPool!
				33
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	34	# CHANGES:
				35	# ----------------------------------------------------------
				36	# 2013/11/25
				37	# - Initial release
				38	#
				39	# 2014/10/29
				40	# - Merges foundry data to create indexer friendly documents
				41	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	42	# 2016/02/04
				43	# - renamed to korapxml2krill
				44	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	45	#
				46	# 2016/02/12
				47	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	48	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	49	#
				50	# 2016/02/14
				51	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	52	# - Added support for archive files
				53	#
				54	# 2016/02/15
				55	# - Fixed temporary directory bug
				56	# - Improved skipping before unzipping
				57	# - Added EXPERIMENTAL concurrency support
				58	#
				59	# 2016/02/23
				60	# - Merge korapxml2krill and korapxml2krill_dir
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	61	#
				62	# 2016/02/27
				63	# - Added extract function
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	64	#
				65	# 2016/03/17
				66	# - Added meta switch
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	67	#
				68	# 2016/03/18
				69	# - Added meta data caching
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	70	#
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	71	# 2016/06/27
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	72	# - Added multi archive support
				73	# - Added prefix negation support
Akron	f3f0c94	2016-06-27 13:27:14 +0200	[diff] [blame]	74	# - Added Malt#Dependency support
Akron	8b99052	2016-07-06 16:45:57 +0200	[diff] [blame]	75	#
				76	# 2016/07/06
				77	# - Added MDParser#Dependency
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	78	#
				79	# 2016/10/15
Nils Diewald	0e48977	2016-10-24 15:16:52 +0200	[diff] [blame]	80	# - Fixed temporary path issue in script
				81	#
				82	# 2016/10/24
				83	# - Improved Windows support
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	84	#
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	85	# 2016/10/24
				86	# - Added support for document extraction
				87	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	88	# 2016/10/27
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	89	# - Added wildcard support for document extraction
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	90	#
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	91	# 2016/12/21
				92	# - added support for base-sentences and base-tokenizations
				93	#
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	94	# 2017/01/20
				95	# - added support for DRuKoLa annotations
				96	#
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	97	# 2017/02/08
				98	# - added support for pagebreak annotations
				99	#
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	100	# 2017/04/06
				101	# - added support for wildcards in input
				102	#
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	103	# 2017/04/07
				104	# - support configuration option
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	105	# - support for temporary extraction
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	106	#
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	107	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	108
# Script metadata used by the --help and --version output
our $LOCAL       = $FindBin::Bin;
our $LAST_CHANGE = '2017/04/07';
our $VERSION_MSG =
  "Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE\n";

# Parse command:
# The first argument is taken as the command name,
# unless it starts with a dash (i.e. it is an option).
our @ARGV;
my $cmd;
if ($ARGV[0] && substr($ARGV[0], 0, 1) ne '-') {
  $cmd = shift @ARGV;
};
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	121
# Multi-value options are collected in arrays
my (@skip, @sigle, @anno, @input);
my $text;

# Parse options from the command line.
# Each specification is "long-name|alias" followed by the value type;
# the alternatives are separated by a plain pipe, as Getopt::Long
# expects. Scalar options are declared in place, so they stay undefined
# when not passed (the configuration file and the defaults rely on that).
GetOptions(
  'input|i=s'              => \@input,
  'output|o=s'             => \(my $output),
  'overwrite|w'            => \(my $overwrite),
  'meta|m=s'               => \(my $meta),
  'token|t=s'              => \(my $token_base),
  'base-sentences|bs=s'    => \(my $base_sentences),
  'base-paragraphs|bp=s'   => \(my $base_paragraphs),
  'base-pagebreaks|bpb=s'  => \(my $base_pagebreaks),
  'gzip|z'                 => \(my $gzip),
  'temporary-extract|te=s' => \(my $extract_dir),
  'skip|s=s'               => \@skip,
  'sigle|sg=s'             => \@sigle,
  'cache|c=s'              => \(my $cache_file),
  'config|cfg=s'           => \(my $cfg_file),
  'log|l=s'                => \(my $log_level),
  'anno|a=s'               => \@anno,
  'primary|p!'             => \(my $primary),
  'pretty|y'               => \(my $pretty),
  'jobs|j=i'               => \(my $jobs),
  'cache-size|cs=s'        => \(my $cache_size),
  'cache-delete|cd!'       => \(my $cache_delete),
  'cache-init|ci!'         => \(my $cache_init),

  # Print the selected POD sections and leave
  'help|h' => sub {
    pod2usage(
      # -sections is a pattern: the pipe separates alternative headings
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
      -output   => '-'
    );
  },

  # Print the version message and leave
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
				165
# Load from configuration.
# Values from the configuration file are only used for options
# that were not given on the command line.
if ($cfg_file && -e $cfg_file) {

  print "Reading config from $cfg_file\n";

  my %config;

  Config::Simple->import_from($cfg_file, \%config);

  # Scalar options: configuration key => variable to fill
  my @scalar_options = (
    ['overwrite'         => \$overwrite],
    ['gzip'              => \$gzip],
    ['jobs'              => \$jobs],
    ['temporary-extract' => \$extract_dir],
    ['token'             => \$token_base],
    ['cache'             => \$cache_file],
    ['cache-size'        => \$cache_size],
    ['cache-delete'      => \$cache_delete],
    ['cache-init'        => \$cache_init],
    ['meta'              => \$meta],
    ['output'            => \$output],
    ['base-sentences'    => \$base_sentences],
    ['base-paragraphs'   => \$base_paragraphs],
    ['base-pagebreaks'   => \$base_pagebreaks],
    ['log'               => \$log_level]
  );

  foreach my $option (@scalar_options) {
    my ($key, $target) = @$option;

    # The command line takes precedence over the configuration
    next if defined $$target;
    next unless defined $config{$key};
    $$target = $config{$key};
  };

  # List options: semicolon separated values in the configuration
  my @list_options = (
    ['skip'  => \@skip],
    ['sigle' => \@sigle],
    ['anno'  => \@anno]
  );

  foreach my $option (@list_options) {
    my ($key, $target) = @$option;

    # The command line takes precedence over the configuration
    next if scalar(@$target);
    next unless defined $config{$key};

    # Whitespace around the separator is optional ("a;b" and "a ; b")
    @$target = split /\s*;\s*/, $config{$key};
  };
};
				265
				266	# Set default token base
				267	$token_base //= 'OpenNLP#tokens';
				268	$cache_file //= 'korapxml2krill.cache';
				269	$cache_size //= '50m';
				270	$jobs //= 0;
				271	$cache_delete //= 1;
				272	$cache_init //= 1;
				273	$log_level //= 'ERROR';
				274	$base_sentences //= '';
				275	$base_paragraphs //= '';
				276	$base_pagebreaks //= '';
				277
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	278	$base_sentences = lc $base_sentences;
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	279	$base_paragraphs = lc $base_paragraphs;
Akron	636bd9c	2017-02-09 17:13:00 +0100	[diff] [blame]	280	$base_pagebreaks = lc $base_pagebreaks;
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	281
# Arguments for pod2usage() on invalid invocations:
# print the listed POD sections to STDOUT and exit with status 1.
my %ERROR_HASH = (
  # -sections is a pattern: the pipe separates alternative headings
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -output   => '-',
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless @input;

# Gzip has no effect, if no output is given
pod2usage(%ERROR_HASH) if $gzip && !$output;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	295
# Initialize log4perl object:
# everything at or above the requested level is written to STDERR
Log::Log4perl->init(
  {
    'log4perl.rootLogger'
      => join(', ', uc($log_level), 'STDERR'),
    'log4perl.appender.STDERR'
      => 'Log::Log4perl::Appender::ScreenColoredLevels',
    'log4perl.appender.STDERR.layout'
      => 'PatternLayout',
    'log4perl.appender.STDERR.layout.ConversionPattern'
      => '[%r] %F %L %c - %m%n'
  }
);

my $log = Log::Log4perl->get_logger('main');

# A job count of -1 means: derive the number of jobs
# from the number of available CPU cores
if ($jobs eq '-1') {
  my $cores = Sys::Info->new->device('CPU')->count;
  $jobs = ceil($cores * 5);
  $log->info("Run using $jobs jobs on $cores cores");
};
				312
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	313
# Lookup table of everything to skip (case insensitive)
my %skip = map { (lc($_) => 1) } @skip;

# All supported annotation layers as [Foundry, Layer] pairs
my @layers;

# The default base layers are only used,
# if no other base annotation was requested
push(@layers, ['Base', 'Sentences'])  unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;

# Foundries listed before the DeReKo structure
foreach my $group (
  [Connexor => qw/Morpho Syntax Phrase Sentences/],
  [CoreNLP  => qw/NamedEntities Sentences Morpho Constituency/]
) {
  my ($foundry, @names) = @$group;
  push(@layers, [$foundry, $_]) foreach @names;
};

# DeReKo:
# collect all base annotations that are taken from the structure layer
my @dereko_attr = ();
foreach my $base (
  [sentences  => $base_sentences],
  [paragraphs => $base_paragraphs],
  [pagebreaks => $base_pagebreaks]
) {
  push(@dereko_attr, $base->[0]) if $base->[1] eq 'dereko#structure';
};

# The third element is only added, if at least one base is requested
push(
  @layers,
  [
    'DeReKo',
    'Structure',
    (@dereko_attr ? 'base-' . join('-', @dereko_attr) : ())
  ]
);

# Foundries listed after the DeReKo structure
foreach my $group (
  [Glemm      => qw/Morpho/],
  [Malt       => qw/Dependency/],
  [MDParser   => qw/Dependency/],
  [Mate       => qw/Morpho Dependency/],
  [OpenNLP    => qw/Morpho Sentences/],
  [Sgbr       => qw/Lemma Morpho/],            # Schreibgebrauch
  [TreeTagger => qw/Morpho Sentences/],
  [XIP        => qw/Morpho Constituency Sentences Dependency/],
  [DRuKoLa    => qw/Morpho/],
  [MarMoT     => qw/Morpho/]
) {
  my ($foundry, @names) = @$group;
  push(@layers, [$foundry, $_]) foreach @names;
};
				390
Akron	4fa37c3	2017-01-20 14:43:10 +0100	[diff] [blame]	391
# Check filters
my @filtered_anno;

# Everything is skipped - only the explicitly requested
# annotations (given as Foundry#Layer) are used
if ($skip{'#all'}) {
  @filtered_anno = map { [ split('#', $_) ] } @anno;
}

# Add all annotations that are not skipped
else {
  # Add to index file - respect skipping
  foreach my $info (@layers) {
    my $foundry = lc($info->[0]);
    my $layer   = lc($info->[1]);

    # Skip if Foundry or Foundry#Layer should be skipped
    next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

    push @filtered_anno, $info;
  };
};

# Get tokenization basis (given as Foundry#Layer).
# Declaration and conditional assignment are kept separate, because
# "my ... if ..." has undefined behaviour in Perl.
my ($token_base_foundry, $token_base_layer);
if ($token_base) {
  ($token_base_foundry, $token_base_layer) = split(/#/, $token_base);
};
				413
				414	# TODO: This should not be initialized for batch
				415	my $cache = Cache::FastMmap->new(
				416	share_file => $cache_file,
				417	cache_size => $cache_size,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	418	init_file => $cache_init
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	419	);
				420
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	421	# Create batch object
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	422	my $batch_file = KorAP::XML::Batch::File->new(
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	423	cache => $cache,
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	424	meta_type => $meta,
				425	overwrite => $overwrite,
Akron	03b24db	2016-08-16 20:54:32 +0200	[diff] [blame]	426	foundry => $token_base_foundry,
				427	layer => $token_base_layer,
				428	gzip => $gzip,
				429	log => $log,
				430	primary => $primary,
				431	pretty => $pretty,
				432	anno => \@filtered_anno
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	433	);
				434
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	435
				436	# Get file name based on path information
				437	sub get_file_name ($) {
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	438	my $i = $input[0];
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	439	if (-d $i) {
				440	$i =~ s![^\/]+$!!;
				441	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	442	my $file = shift;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	443
				444	# Remove temp dir fragments
Akron	6255760	2016-06-27 14:10:13 +0200	[diff] [blame]	445	$file =~ s!^/?tmp/[^/]+!!;
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	446	$file =~ s/^?\/?$i//;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	447	$file =~ tr/\//-/;
				448	$file =~ s{^-+}{};
Akron	b4bbec7	2016-10-26 20:21:02 +0200	[diff] [blame]	449	$file =~ s/^.*?-(.+?-.+?-.+?)$/$1/;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	450	return $file;
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	451	};
				452
# Convert sigle to path construct (e.g. "A_B.C" becomes "A/B/C").
# Surrounding whitespace is optional and removed.
foreach my $sigle (@sigle) {
  $sigle =~ s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3!;
};

# Commands require the output, if given, to be an existing directory
if ($cmd && $output && !-d $output) {
  print "Directory '$output' does not exist.\n\n";

  # This is an error - signal it with a non-zero exit status
  exit(1);
};

# Glob files
if (@input) {
  my @new_input = ();

  # Iterate over all inputs and expand wildcards
  foreach my $pattern (@input) {
    push(@new_input, bsd_glob($pattern));
  };

  # Only rewrite the input, if the expansion yielded additional files
  if (scalar(@new_input) > scalar(@input)) {
    @input = sort { length($a) <=> length($b) } @new_input;
    print 'Input rewritten to ' . join(', ', @input) . "\n";
  };
};
				477
				478
# Process a single file (no command was given)
unless ($cmd) {
  my $input = $input[0];

  # Remember the start time at compile time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log the time taken since the last stop and since the start
  sub stop_time {
    my $new = Benchmark->new;
    $log->info(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document:
  # the input path is expected to end with a slash
  $input =~ s{([^/])$}{$1/};

  # Process file
  # NOTE(review): assumes process() returns a true value on success,
  # as its result is used that way for directory input - confirm.
  my $processed = $batch_file->process($input, $output);

  # Delete cache file
  unlink($cache_file) if $cache_delete;

  stop_time();

  # Exit with status 0 on success (the script used to exit
  # with status 1 unconditionally)
  exit($processed ? 0 : 1);
};
				510
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	511
# Extract XML files:
# Extracts the texts or documents given by sigle (or all texts, if no
# sigle is given) from the archives in @input to the output directory.
if ($cmd eq 'extract') {

  # Create new archive object
  if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {

    # Check zip capabilities
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";

      # This is an error - signal it with a non-zero exit status
      exit(1);
    };

    # Add further annotation archived
    $archive->attach($_) foreach @input[1..$#input];

    # True, if the paths in the archive need a "./" prefix
    my $prefix = 1;

    # No sigles given
    unless (@sigle) {

      # Get files
      foreach my $text_path ($archive->list_texts) {

        # Split path information
        my ($corpus, $doc, $text);
        ($prefix, $corpus, $doc, $text) = $archive->split_path($text_path);

        # TODO: Make this OS independent
        push @sigle, join '/', $corpus, $doc, $text;
      };
    }

    # Check sigle for doc sigles
    else {
      my @new_sigle;

      # The archive is asked for the prefix only once,
      # when the first sigle is handled
      my $prefix_check = 0;
      my $check_prefix_once = sub {
        return if $prefix_check;
        if ($prefix = $archive->check_prefix) {
          print " with prefix ...";
        };
        $prefix_check = 1;
        return;
      };

      # Iterate over all sigle
      foreach my $sigle (@sigle) {

        # Sigle is a doc sigle (two path segments)
        if ($sigle =~ m!^(?:\.[/\\])?[^/\\]+?[/\\][^/\\]+?$!) {

          print "$sigle ...";

          # Check if a prefix is needed
          $check_prefix_once->();

          print "\n";

          # TODO: Make this OS independent
          my $path = ($prefix ? './' : '') . $sigle;

          # Documents are extracted immediately
          print '... ' . (
            $archive->extract_doc(
              $path, $output, $jobs
            ) ? '' : 'not '
          );
          print "extracted.\n";
        }

        # Sigle is a text sigle - extracted below
        else {
          push @new_sigle, $sigle;

          # Check if a prefix is needed
          $check_prefix_once->();
        };
      };

      # Only text sigles are left to extract
      @sigle = @new_sigle;
    };

    # Iterate over all given sigles and extract
    foreach my $sigle (@sigle) {

      print "$sigle ...\n";

      # TODO: Make this OS independent
      print '... ' . (
        $archive->extract_text(
          ($prefix ? './' : '') . $sigle, $output
        ) ? '' : 'not '
      );
      print "extracted.\n";
    };

    print "\n";
    # exit(1);
  }

  # Can't create archive object
  else {
    $log->error('Unable to extract from primary archive ' . $input[0]);
    exit(1);
  };
}
				619
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	620
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	621	# Process an archive
				622	elsif ($cmd eq 'archive') {
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	623
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	624	my $archive_output;
				625
				626	# First extract, then archive
				627	if (defined $extract_dir) {
				628
				629	# Create new archive object
				630	if (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
				631
				632	# Check zip capabilities
				633	unless ($archive->test_unzip) {
				634	print "Unzip is not installed or incompatible.\n\n";
				635	exit(0);
				636	};
				637
				638	# Add further annotation archived
				639	$archive->attach($_) foreach @input[1..$#input];
				640
				641	# Create a temporary directory
				642	if ($extract_dir eq ':temp:') {
				643	$extract_dir = tempdir(CLEANUP => 1);
				644	};
				645
				646	if ($archive->extract_all($extract_dir, $jobs)) {
				647	@input = ($extract_dir);
				648	}
				649	else {
				650	$log->error('Unable to extract from primary archive ' . $input[0] .
				651	' to ' . $extract_dir);
				652	exit(1);
				653	};
				654	}
				655
				656	# Can't create archive object
				657	else {
				658	$log->error('Unable to extract from primary archive ' . $input[0]);
				659	exit(1);
				660	};
				661	};
				662
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	663	# TODO: Support sigles
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	664
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	665	# Zero means: everything runs in the parent process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	666	my $pool = Parallel::ForkManager->new($jobs);
				667
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	668	my $count = 0; # Texts to process
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	669	my $iter = 1; # Current text in process
				670
				671	# Report on fork message
				672	$pool->run_on_finish (
				673	sub {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	674	my ($pid, $code) = @_;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	675	my $data = pop;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	676
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	677	print 'Convert ['. ($jobs > 0 ? "\$$pid:" : '') .
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	678	($iter++) . "/$count]" .
				679	($code ? " $code" : '') .
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	680	' ' . $data->[0] . "\n";
				681	$data->[1] = undef if $data->[1];
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	682	}
				683	);
				684
				685	my $t;
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	686	my $temp;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	687	print "Reading data ...\n";
				688
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	689	# unless (Cache::FastMmap->new(
				690	# share_file => $cache_file,
				691	# cache_size => $cache_size,
				692	# init_file => $cache_init
				693	# )) {
				694	# print "Unable to intialize cache '$cache_file'\n\n";
				695	# exit(1);
				696	# };
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	697
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	698	# Input is a directory
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	699	if (-d $input[0]) {
				700	my $it = Directory::Iterator->new($input[0]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	701	my @dirs;
				702	my $dir;
				703
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	704	# Todo: Make a DO WHILE
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	705	while (1) {
				706	if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	707	push @dirs, $dir;
				708	$it->prune;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	709	};
				710	last unless $it->next;
				711	};
				712
				713	print "Start processing ...\n";
				714	$t = Benchmark->new;
				715	$count = scalar @dirs;
				716
				717	DIRECTORY_LOOP:
				718	for (my $i = 0; $i < $count; $i++) {
				719
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	720	my $filename = catfile(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	721	$output,
				722	get_file_name($dirs[$i]) . '.json' . ($gzip ? '.gz' : '')
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	723	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	724
				725	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	726	$pool->start and next DIRECTORY_LOOP;
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	727
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	728	if (my $return = $batch_file->process($dirs[$i] => $filename)) {
				729	$pool->finish(
				730	0,
				731	["Processed " . $filename . ($return == -1 ? " - already existing" : '')]
				732	);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	733	}
				734	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	735	$pool->finish(1, ["Unable to process " . $dirs[$i]]);
Akron	3ec4897	2016-08-17 23:24:52 +0200	[diff] [blame]	736	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	737	};
				738	}
				739
				740	# Input is a file
Akron	29866ac	2016-06-24 16:40:47 +0200	[diff] [blame]	741	elsif (-f($input[0]) && (my $archive = KorAP::XML::Archive->new($input[0]))) {
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	742
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	743	unless ($archive->test_unzip) {
				744	print "Unzip is not installed or incompatible.\n\n";
				745	exit(1);
				746	};
				747
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	748	# Add further annotation archives
Akron	2812ba2	2016-10-28 21:55:59 +0200	[diff] [blame]	749	$archive->attach($_) foreach @input[1..$#input];
Akron	08385f6	2016-03-22 20:37:04 +0100	[diff] [blame]	750
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	751	print "Start processing ...\n";
				752	$t = Benchmark->new;
				753	my @dirs = $archive->list_texts;
				754	$count = scalar @dirs;
				755
				756	ARCHIVE_LOOP:
				757	for (my $i = 0; $i < $count; $i++) {
				758
				759	# Split path information
				760	my ($prefix, $corpus, $doc, $text) = $archive->split_path($dirs[$i]);
				761
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	762	my $filename = catfile(
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	763	$output,
				764	get_file_name(
				765	catfile($corpus, $doc, $text)
				766	. '.json' . ($gzip ? '.gz' : '')
				767	)
Akron	e1dbc38	2016-07-08 22:24:52 +0200	[diff] [blame]	768	);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	769
				770	# Get the next fork
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	771	$pool->start and next ARCHIVE_LOOP;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	772
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	773	# Create temporary file
				774	$temp = File::Temp->newdir;
				775
Akron	bdf434a	2016-10-24 17:42:07 +0200	[diff] [blame]	776	# TODO: Check if $filename exists at the beginning,
				777	# because extraction can be horribly slow!
				778
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	779	# Extract from archive
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	780	if ($archive->extract_text($dirs[$i], $temp)) {
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	781
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	782	# Create corpus directory
				783	my $input = catdir("$temp", $corpus);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	784
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	785	# Temporary directory
				786	my $dir = catdir($input, $doc, $text);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	787
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	788	# Write file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	789	if (my $return = $batch_file->process($dir => $filename)) {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	790	# Delete temporary file
Akron	13d5662	2016-10-31 14:54:49 +0100	[diff] [blame]	791	$pool->finish(
				792	0,
				793	["Processed " . $filename . ($return == -1 ? " - already existing" : ''), $temp]
				794	);
				795	#$pool->finish(0, ["Processed " . $filename, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	796	}
				797	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	798	# Delete temporary file
				799	$pool->finish(1, ["Unable to process " . $dir, $temp]);
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	800	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	801	}
Akron	7d4cdd8	2016-08-17 21:39:45 +0200	[diff] [blame]	802
				803	# Unable to extract
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	804	else {
Akron	4c0cf31	2016-10-15 16:42:09 +0200	[diff] [blame]	805	$pool->finish(1, ["Unable to extract " . $dirs[$i], $temp]);
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	806	};
				807	};
				808	}
				809
				810	else {
				811	print "Input is neither a directory nor an archive.\n\n";
				812	};
				813
				814	$pool->wait_all_children;
				815
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	816	# Delete cache file
				817	unlink($cache_file) if $cache_delete;
				818
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	819	print "Done.\n";
				820	print timestr(timediff(Benchmark->new, $t))."\n\n";
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	821	};
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	822
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	823
				824	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	825
				826	=pod
				827
				828	=encoding utf8
				829
				830	=head1 NAME
				831
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	832	korapxml2krill - Merge KorapXML data and create Krill documents
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	833
				834
				835	=head1 SYNOPSIS
				836
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	837	korapxml2krill [archive|extract] --input <directory|archive> [options]
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	838
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	839
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	840	=head1 DESCRIPTION
				841
				842	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				843	compatible with the L<Krill|https://github.com/KorAP/Krill> indexer.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	844	The C<korapxml2krill> command line tool is a simple wrapper to the library.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	845
				846
				847	=head1 INSTALLATION
				848
				849	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm|App::cpanminus>.
				850
Akron	af38698	2016-10-12 00:33:25 +0200	[diff] [blame]	851	$ cpanm https://github.com/KorAP/KorAP-XML-Krill.git
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	852
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	853	In case everything went well, the C<korapxml2krill> tool will
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	854	be available on your command line immediately.
Akron	7438151	2016-10-14 11:56:22 +0200	[diff] [blame]	855	Minimum requirement for L<KorAP::XML::Krill> is Perl 5.14.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	856	In addition, to work with zip archives, the C<unzip> tool needs to be present.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	857
				858	=head1 ARGUMENTS
				859
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	860	$ korapxml2krill -z --input <directory> --output <filename>
				861
				862	Without arguments, C<korapxml2krill> converts a directory of a single KorAP-XML document.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	863	It expects the input to point to the text level folder.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	864
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	865	=over 2
				866
				867	=item B<archive>
				868
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	869	$ korapxml2krill archive -z --input <directory\|archive> --output <directory>
				870
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	871	Converts an archive of KorAP-XML documents. It expects a directory
				872	(pointing to the corpus level folder) or one or more zip files as input.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	873
				874	=item B<extract>
				875
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	876	$ korapxml2krill extract --input <archive> --output <directory> --sigle <SIGLE>
				877
				878	Extracts KorAP-XML documents from a zip file.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	879
				880	=back
				881
				882
				883	=head1 OPTIONS
				884
				885	=over 2
				886
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	887	=item B<--input\|-i> <directory\|zip file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	888
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	889	Directory or zip file(s) of documents to convert.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	890
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	891	Without arguments, C<korapxml2krill> expects a folder of a single KorAP-XML
Akron	f1a1de9	2016-11-02 17:32:12 +0100	[diff] [blame]	892	document, while C<archive> expects a KorAP-XML corpus folder or a zip
				893	file to batch process multiple files.
				894	C<extract> expects zip files only.
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	895
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	896	C<archive> supports multiple input zip files with the constraint,
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	897	that the first archive listed contains all primary data files
				898	and all meta data files.
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	899
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	900	-i file/news.zip -i file/news.malt.zip -i "#file/news.tt.zip"
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	901
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	902	Input may also be defined using BSD glob wildcards.
				903
				904	-i 'file/news*.zip'
				905
				906	The extended input array will be sorted in length order, so the shortest
				907	path needs to contain all primary data files and all meta data files.
				908
Akron	0c3e375	2016-06-28 15:55:53 +0200	[diff] [blame]	909	(The directory structure follows the base directory format,
				910	that may include a C<.> root folder.
				911	In this case further archives lacking a C<.> root folder
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	912	need to be passed with a hash sign in front of the archive's name.
				913	This may require quoting the parameter.)
Akron	2cfe809	2016-06-24 17:48:49 +0200	[diff] [blame]	914
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	915	To support zip files, a version of C<unzip> needs to be installed that is
				916	compatible with the archive file.
Akron	a93d51b	2016-10-24 20:27:48 +0200	[diff] [blame]	917
Akron	7606afa	2016-10-25 16:23:49 +0200	[diff] [blame]	918	B<The root folder switch using the hash sign is experimental and
				919	may vanish in future versions.>
Akron	651cb8d	2016-08-16 21:44:49 +0200	[diff] [blame]	920
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	921	=item B<--output\|-o> <directory\|file>
				922
				923	Output folder for archive processing or
				924	document name for single output (optional),
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	925	writes to C<STDOUT> by default
				926	(in case C<output> is not mandatory due to further options).
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	927
				928	=item B<--overwrite\|-w>
				929
				930	Overwrite files that already exist.
				931
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	932	=item B<--token\|-t> <foundry>#<file>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	933
				934	Define the default tokenization by specifying
				935	the name of the foundry and optionally the name
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	936	of the layer-file. Defaults to C<OpenNLP#tokens>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	937
Akron	3741f8b	2016-12-21 19:55:21 +0100	[diff] [blame]	938
				939	=item B<--base-sentences\|-bs> <foundry>#<layer>
				940
				941	Define the layer for base sentences.
				942	If given, this will be used instead of using C<Base#Sentences>.
				943	Currently C<DeReKo#Structure> is the only additional layer supported.
				944
				945	Defaults to unset.
				946
				947
				948	=item B<--base-paragraphs\|-bp> <foundry>#<layer>
				949
				950	Define the layer for base paragraphs.
				951	If given, this will be used instead of using C<Base#Paragraphs>.
				952	Currently C<DeReKo#Structure> is the only additional layer supported.
				953
				954	Defaults to unset.
				955
				956
Akron	41ac10b	2017-02-08 22:47:25 +0100	[diff] [blame]	957	=item B<--base-pagebreaks\|-bpb> <foundry>#<layer>
				958
				959	Define the layer for base pagebreaks.
				960	Currently C<DeReKo#Structure> is the only layer supported.
				961
				962	Defaults to unset.
				963
				964
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	965	=item B<--skip\|-s> <foundry>[#<layer>]
				966
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	967	Skip specific annotations by specifying the foundry
				968	(and optionally the layer with a C<#>-prefix),
				969	e.g. C<Mate> or C<Mate#Morpho>. Alternatively you can skip C<#ALL>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	970	Can be set multiple times.
				971
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	972	=item B<--anno\|-a> <foundry>#<layer>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	973
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	974	Convert specific annotations by specifying the foundry
				975	(and optionally the layer with a C<#>-prefix),
				976	e.g. C<Mate> or C<Mate#Morpho>.
				977	Can be set multiple times.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	978
				979	=item B<--primary\|-p>
				980
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	981	Output primary data or not. Defaults to C<true>.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	982	Can be flagged using C<--no-primary> as well.
				983	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	984
				985	=item B<--jobs\|-j>
				986
				987	Define the number of concurrent jobs in separate forks
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	988	for archive processing.
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	989	Defaults to C<0> (everything runs in a single process).
Akron	c11f798	2017-02-21 21:20:14 +0100	[diff] [blame]	990	Pass -1, and the value will be set automatically to 5
				991	times the number of available cores.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	992	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	993
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	994	=item B<--meta\|-m>
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	995
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	996	Define the metadata parser to use. Defaults to C<I5>.
				997	Metadata parsers can be defined in the C<KorAP::XML::Meta> namespace.
				998	This is I<experimental>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	999
				1000	=item B<--pretty\|-y>
				1001
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1002	Pretty print JSON output. Defaults to C<false>.
Akron	35db6e3	2016-03-17 22:42:22 +0100	[diff] [blame]	1003	This is I<deprecated>.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1004
				1005	=item B<--gzip\|-z>
				1006
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1007	Compress the output.
				1008	Expects a defined C<output> file in single processing.
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1009
Akron	11c8030	2016-03-18 19:44:43 +0100	[diff] [blame]	1010	=item B<--cache\|-c>
				1011
				1012	File to mmap a cache (using L<Cache::FastMmap>).
				1013	Defaults to C<korapxml2krill.cache> in the calling directory.
				1014
				1015	=item B<--cache-size\|-cs>
				1016
				1017	Size of the cache. Defaults to C<50m>.
				1018
				1019	=item B<--cache-init\|-ci>
				1020
				1021	Initialize cache file.
				1022	Can be flagged using C<--no-cache-init> as well.
				1023	Defaults to C<true>.
				1024
				1025	=item B<--cache-delete\|-cd>
				1026
				1027	Delete cache file after processing.
				1028	Can be flagged using C<--no-cache-delete> as well.
				1029	Defaults to C<true>.
				1030
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1031	=item B<--config\|-cfg>
				1032
				1033	Configure the parameters of your call in a file
				1034	of key-value pairs, separated by whitespace:
				1035
				1036	overwrite 1
				1037	token DeReKo#Structure
				1038	...
				1039
				1040	Supported parameters are:
				1041	C<overwrite>, C<gzip>, C<jobs>,
				1042	C<token>, C<log>, C<cache>, C<cache-size>, C<cache-delete>, C<meta>,
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	1043	C<output>, C<base-sentences>, C<temp-extract>, C<base-paragraphs>,
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1044	C<base-pagebreaks>, C<skip> (semicolon separated), C<sigle>
				1045	(semicolon separated), C<anno> (semicolon separated).
				1046
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	1047	=item B<--temporary-extract\|-te>
				1048
				1049	Only valid for the C<archive> command.
				1050
				1051	This will first extract all files into a
				1052	directory and will then run the C<archive> conversion on it.
				1053	If the directory is given as C<:temp:>,
				1054	a temporary directory is used.
				1055	This is especially useful to avoid
				1056	massive unzipping and potential
				1057	network latency.
Akron	636aa11	2017-04-07 18:48:56 +0200	[diff] [blame]	1058
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1059	=item B<--sigle\|-sg>
				1060
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1061	Extract the given texts.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1062	Can be set multiple times.
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1063	I<Currently only supported on C<extract>.>
Akron	b0c88db	2016-06-29 16:33:18 +0200	[diff] [blame]	1064	Sigles have the structure C<Corpus>/C<Document>/C<Text>.
Akron	2080758	2016-10-26 17:11:34 +0200	[diff] [blame]	1065	In case the C<Text> path is omitted, the whole document will be extracted.
Akron	2fd402b	2016-10-27 21:26:48 +0200	[diff] [blame]	1066	On the document level, the postfix wildcard C<*> is supported.
Akron	e10ad32	2016-02-27 10:54:26 +0100	[diff] [blame]	1067
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1068	=item B<--log\|-l>
				1069
				1070	The L<Log4perl> log level, defaults to C<ERROR>.
				1071
				1072	=item B<--help\|-h>
				1073
				1074	Print this document.
				1075
				1076	=item B<--version\|-v>
				1077
				1078	Print version information.
				1079
				1080	=back
				1081
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1082	=head1 ANNOTATION SUPPORT
				1083
				1084	L<KorAP::XML::Krill> has built-in importers for some annotation foundries and layers
				1085	developed in the KorAP project that are part of the KorAP preprocessing pipeline.
				1086	The base foundry with paragraphs, sentences, and the text element is mandatory for
				1087	L<Krill|https://github.com/KorAP/Krill>.
				1088
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1089	Base
				1090	#Paragraphs
				1091	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1092
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1093	Connexor
				1094	#Morpho
				1095	#Phrase
				1096	#Sentences
				1097	#Syntax
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1098
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1099	CoreNLP
				1100	#Constituency
				1101	#Morpho
				1102	#NamedEntities
				1103	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1104
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1105	DeReKo
				1106	#Structure
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1107
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1108	DRuKoLa
				1109	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1110
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1111	Glemm
				1112	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1113
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1114	Malt
				1115	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1116
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1117	MarMoT
				1118	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1119
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1120	Mate
				1121	#Dependency
				1122	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1123
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1124	MDParser
				1125	#Dependency
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1126
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1127	OpenNLP
				1128	#Morpho
				1129	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1130
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1131	Sgbr
				1132	#Lemma
				1133	#Morpho
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1134
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1135	TreeTagger
				1136	#Morpho
				1137	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1138
Akron	821db3d	2017-04-06 21:19:31 +0200	[diff] [blame]	1139	XIP
				1140	#Constituency
				1141	#Morpho
				1142	#Sentences
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1143
Akron	c13a170	2016-03-15 19:33:14 +0100	[diff] [blame]	1144
				1145	More importers are in preparation.
				1146	New annotation importers can be defined in the C<KorAP::XML::Annotation> namespace.
				1147	See the built-in annotation importers as examples.
				1148
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1149	=head1 AVAILABILITY
				1150
				1151	https://github.com/KorAP/KorAP-XML-Krill
				1152
				1153
				1154	=head1 COPYRIGHT AND LICENSE
				1155
Akron	3ec0a1c	2017-01-18 14:41:55 +0100	[diff] [blame]	1156	Copyright (C) 2015-2017, L<IDS Mannheim|http://www.ids-mannheim.de/>
Akron	f7ad89e	2016-03-16 18:22:47 +0100	[diff] [blame]	1157
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1158	Author: L<Nils Diewald|http://nils-diewald.de/>
Akron	8150010	2017-04-07 20:45:44 +0200	[diff] [blame^]	1159
Akron	a76d835	2016-10-27 16:27:32 +0200	[diff] [blame]	1160	Contributor: Eliza Margaretha
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame]	1161
				1162	L<KorAP::XML::Krill> is developed as part of the L<KorAP|http://korap.ids-mannheim.de/>
				1163	Corpus Analysis Platform at the
				1164	L<Institute for the German Language (IDS)|http://ids-mannheim.de/>,
				1165	member of the
				1166	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
				1167
				1168	This program is free software published under the
				1169	L<BSD-2 License|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				1170
				1171	=cut