Blame - script/korapxml2krill - KorAP/KorAP-XML-Krill

blob: 03a808893243dba53cb01ebfe1a81797e7a59eca [file] [log] [blame]

Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	1	#!/usr/bin/env perl
				2	use strict;
				3	use warnings;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	4	use FindBin;
				5	BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
				6	use File::Spec::Functions qw/catfile catdir/;
				7	use Getopt::Long qw/GetOptions :config no_auto_abbrev/;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	8	use Benchmark qw/:hireswallclock/;
				9	use IO::Compress::Gzip qw/$GzipError/;
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	10	use Log::Log4perl;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	11	use Pod::Usage;
				12	use Directory::Iterator;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	13	use KorAP::XML::Krill;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	14	use KorAP::XML::Archive;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	15	use KorAP::XML::Tokenizer;
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	16	use Parallel::ForkManager;
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	17
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	18	# CHANGES:
				19	# ----------------------------------------------------------
				20	# 2013/11/25
				21	# - Initial release
				22	#
				23	# 2014/10/29
				24	# - Merges foundry data to create indexer friendly documents
				25	#
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	26	# 2016/02/04
				27	# - renamed to korapxml2krill
				28	# - added Schreibgebrauch support
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	29	#
				30	# 2016/02/12
				31	# - fixed foundry skipping
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	32	# - Support overwrite in archive processing
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	33	#
				34	# 2016/02/14
				35	# - Added version information
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	36	# - Added support for archive files
				37	#
				38	# 2016/02/15
				39	# - Fixed temporary directory bug
				40	# - Improved skipping before unzipping
				41	# - Added EXPERIMENTAL concurrency support
				42	#
				43	# 2016/02/23
				44	# - Merge korapxml2krill and korapxml2krill_dir
				45	# ----------------------------------------------------------
Akron	069bd71	2016-02-12 19:09:06 +0100	[diff] [blame]	46
# Date of the most recent entry in the change log above
our $LAST_CHANGE = '2016/02/23';

# Directory this script lives in
our $LOCAL = $FindBin::Bin;

# One-line banner: library version, contact address, last change date
our $VERSION_MSG = join(
  ' - ',
  "Version $KorAP::XML::Krill::VERSION",
  'diewald@ids-mannheim.de',
  $LAST_CHANGE
) . "\n";
				52
				53
				54	# Parse comand
				55	my $cmd;
				56	our @ARGV;
				57	if ($ARGV[0] && index($ARGV[0], '-') != 0) {
				58	$cmd = shift @ARGV;
Akron	150b29e	2016-02-14 23:06:48 +0100	[diff] [blame]	59	};
Akron	93d620e	2016-02-05 19:40:05 +0100	[diff] [blame]	60
# Parse options from the command line.
# Alternative names inside an option specification are separated by a
# plain '|' (e.g. 'input|i=s'). In a single-quoted string a backslash in
# front of the bar stays part of the text and would corrupt the spec.
GetOptions(
  'input|i=s'   => \(my $input),
  'output|o=s'  => \(my $output),
  'overwrite|w' => \(my $overwrite),
  'human|m'     => \(my $text),
  'token|t=s'   => \(my $token_base),
  'gzip|z'      => \(my $gzip),
  'skip|s=s'    => \(my @skip),
  'log|l=s'     => \(my $log_level = 'ERROR'),
  'allow|a=s'   => \(my @allow),
  'primary|p!'  => \(my $primary),
  'pretty|y'    => \(my $pretty),
  'jobs|j=i'    => \(my $jobs = 0),

  # Print the main documentation sections together with the version banner
  'help|h'      => sub {
    pod2usage(
      -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
      -verbose  => 99,
      -msg      => $VERSION_MSG,
    );
  },

  # Print the version banner followed by the brief usage
  'version|v'   => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG
    )
  }
);
				89
# Arguments handed to pod2usage whenever the invocation is invalid:
# show the listed documentation sections plus the version banner and
# leave with exit status 1.
# The section names are alternatives separated by a plain '|'.
my %ERROR_HASH = (
  -sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
  -verbose  => 99,
  -msg      => $VERSION_MSG,
  -exit     => 1
);

# Input has to be defined
pod2usage(%ERROR_HASH) unless $input;
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	99
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	100
# Initialize log4perl object.
# The root logger uses the level given by --log (upper-cased) and a
# single appender named STDERR with a pattern layout.
my %log_conf = (
  'log4perl.rootLogger'
    => uc($log_level) . ', STDERR',
  'log4perl.appender.STDERR'
    => 'Log::Log4perl::Appender::ScreenColoredLevels',
  'log4perl.appender.STDERR.layout'
    => 'PatternLayout',
  'log4perl.appender.STDERR.layout.ConversionPattern'
    => '[%r] %F %L %c - %m%n'
);
Log::Log4perl->init(\%log_conf);

# Logger shared by the rest of the script
my $log = Log::Log4perl->get_logger('main');
				110
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	111
				112	# Get file name based on path information
				113	sub get_file_name ($) {
				114	my $file = shift;
				115	$file =~ s/^?\/?$input//;
				116	$file =~ tr/\//-/;
				117	$file =~ s{^-+}{};
				118	return $file;
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	119	};
				120
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	121
				122	# Write file
				123	sub write_file {
				124	my $anno = shift;
				125	my $file = get_file_name $anno;
				126
				127	# TODO: This should be done directly with a data structure! KorAP::XML::Wrap
				128
				129	my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
				130	$anno . ' -o ' . $output . '/' . $file . '.json';
				131	$call .= '.gz -z' if $gzip;
				132	$call .= ' -m' if $text;
				133	$call .= ' -w' if $overwrite;
				134	$call .= ' -t ' . $token_base if $token_base;
				135	$call .= ' -l ' . $log_level if $log_level;
				136	$call .= ' --no-primary ' if $primary;
				137	$call .= ' -y ' . $pretty if $pretty;
				138	$call .= ' -a ' . $_ foreach @allow;
				139	$call .= ' -s ' . $_ foreach @skip;
				140	system($call);
				141	return "$file";
Nils Diewald	7364d1f	2013-11-05 19:26:35 +0000	[diff] [blame]	142	};
				143
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	144
# Process a single file
unless ($cmd) {

  # IO::File is not among the modules loaded at the top of the script,
  # but the output branch below calls IO::File->new
  require IO::File;

  # Can't print gzip to STDOUT
  pod2usage(%ERROR_HASH) if $gzip && !$output;

  # Lower-cased lookup table of everything passed with --skip
  my %skip;
  $skip{lc($_)} = 1 foreach @skip;

  # Ignore processing
  if (!$overwrite && $output && -e $output) {
    $log->trace($output . ' already exists');
    exit(0);
  };

  # Reference points for the timing output of stop_time
  BEGIN {
    $main::TIME = Benchmark->new;
    $main::LAST_STOP = Benchmark->new;
  };

  # Log (at trace level) the time since the previous call and the
  # overall time since compilation, then reset the lap timer
  sub stop_time {
    my $new = Benchmark->new;
    $log->trace(
      'The code took: '.
        timestr(timediff($new, $main::LAST_STOP)) .
        ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
      );
    $main::LAST_STOP = $new;
  };

  # Create and parse new document
  $input =~ s{([^/])$}{$1/};
  my $doc = KorAP::XML::Krill->new( path => $input );

  # Name used in log messages: $output is undefined when printing to
  # STDOUT, in that case the input path identifies the document
  my $label = defined $output ? $output : $input;

  unless ($doc->parse) {
    $log->warn($label . " can't be processed - no document data");
    exit(0);
  };

  # Base tokenization, overridable by --token <foundry>[#<layer>].
  # A missing part keeps its default instead of becoming undefined.
  my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
  if ($token_base) {
    my ($foundry, $layer) = split /#/, $token_base;
    $token_base_foundry = $foundry if defined $foundry && length $foundry;
    $token_base_layer   = $layer   if defined $layer   && length $layer;
  };

  # Get tokenization
  my $tokens = KorAP::XML::Tokenizer->new(
    path => $doc->path,
    doc => $doc,
    foundry => $token_base_foundry,
    layer => $token_base_layer,
    name => 'tokens'
  );

  # Unable to process base tokenization
  unless ($tokens->parse) {
    $log->error($label . " can't be processed - no base tokenization");
    exit(0);
  };

  # All known [foundry, layer] pairs in the order they are added
  my @layers = (
    ['Base', 'Sentences'],
    ['Base', 'Paragraphs'],

    # Connexor
    ['Connexor', 'Morpho'],
    ['Connexor', 'Syntax'],
    ['Connexor', 'Phrase'],
    ['Connexor', 'Sentences'],

    # CoreNLP
    ['CoreNLP', 'NamedEntities'],
    ['CoreNLP', 'Sentences'],
    ['CoreNLP', 'Morpho'],
    ['CoreNLP', 'Constituency'],

    # DeReKo
    ['DeReKo', 'Structure'],

    # Glemm
    ['Glemm', 'Morpho'],

    # Malt
    # ['Malt', 'Dependency'],

    # Mate
    ['Mate', 'Morpho'],
    ['Mate', 'Dependency'],

    # OpenNLP
    ['OpenNLP', 'Morpho'],
    ['OpenNLP', 'Sentences'],

    # Schreibgebrauch
    ['Sgbr', 'Lemma'],
    ['Sgbr', 'Morpho'],

    # TreeTagger
    ['TreeTagger', 'Morpho'],
    ['TreeTagger', 'Sentences'],

    # XIP
    ['XIP', 'Morpho'],
    ['XIP', 'Constituency'],
    ['XIP', 'Sentences'],
    ['XIP', 'Dependency']
  );

  # Everything is skipped: add only what was explicitly allowed
  if ($skip{'#all'}) {
    foreach (@allow) {
      $tokens->add(split('#', $_));
      stop_time();
    };
  }
  else {
    # Add to index file - respect skipping
    foreach my $info (@layers) {
      my ($foundry, $layer) = map { lc } @$info;

      # Skip if Foundry or Foundry#Layer should be skipped
      next if $skip{$foundry} || $skip{$foundry . '#' . $layer};

      $tokens->add(@$info);
      stop_time();
    };
  };

  # Serialize: human readable text, pretty JSON or compact JSON
  my $print_text = $text ? $tokens->to_string($primary) :
    ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));

  if ($output) {
    my $file = $gzip
      ? IO::Compress::Gzip->new($output, Minimal => 1)
      : IO::File->new($output, "w");

    # Both constructors return a false value when the file can't be opened
    unless ($file) {
      $log->error(
        "Unable to open $output for writing: " . ($gzip ? $GzipError : $!)
      );
      exit(1);
    };

    $file->print($print_text);
    $file->close;
  }

  else {
    print $print_text . "\n";
  };

  stop_time();
}
Nils Diewald	59094f2	2014-11-05 18:20:50 +0000	[diff] [blame]	294
# Process an archive
elsif ($cmd eq 'archive') {

  # File::Temp is not among the modules loaded at the top of the script,
  # but archive extraction below calls File::Temp->newdir
  require File::Temp;

  pod2usage(%ERROR_HASH) unless $output;

  if ($output && (!-e $output || !-d $output)) {
    print "Directory '$output' does not exist.\n\n";
    exit(0);
  };

  # Zero means: everything runs in the parent process
  my $pool = Parallel::ForkManager->new($jobs);

  my $count = 0; # Texts to process
  my $iter = 1;  # Current text in process

  # Report on fork message
  $pool->run_on_finish (
    sub {
      # The callback receives the pid and the exit code first; the
      # reference handed to finish() is the last argument and may be
      # missing if a child ended without delivering data
      my ($pid, $code) = @_;
      my $data = $_[-1];
      print 'Convert ['. ($jobs > 0 ? "$pid:" : '') .
        ($iter++) . "/$count]" .
        ($code ? " $code" : '') .
        ' ' . (ref $data ? $$data : '') . "\n";
    }
  );

  # Decide whether a text can be skipped because its output file exists
  # and --overwrite is not set. Reports the skip and advances the
  # progress counter. Takes the path passed on to get_file_name.
  my $skip_existing = sub {
    my $path = shift;
    return 0 if $overwrite;

    my $filename = catfile(
      $output,
      get_file_name($path) . '.json' . ($gzip ? '.gz' : '')
    );
    return 0 unless -e $filename;

    $iter++;
    print "Skip $filename\n";
    return 1;
  };

  my $t;
  print "Reading data ...\n";

  # Input is a directory
  if (-d $input) {
    my $it = Directory::Iterator->new($input);
    my @dirs;
    my $dir;

    # Collect every directory that contains a data.xml
    while (1) {
      if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
        push @dirs, $dir;
        $it->prune;
      };
      last unless $it->next;
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    $count = scalar @dirs;

  DIRECTORY_LOOP:
    foreach my $text_dir (@dirs) {

      next DIRECTORY_LOOP if $skip_existing->($text_dir);

      # Get the next fork - the parent continues with the next text
      $pool->start and next DIRECTORY_LOOP;

      my $msg = write_file($text_dir);
      $pool->finish(0, \$msg);
    };
  }

  # Input is a file
  elsif (-f($input) && (my $archive = KorAP::XML::Archive->new($input))) {
    unless ($archive->test_unzip) {
      print "Unzip is not installed or incompatible.\n\n";
      exit(1);
    };

    unless ($archive->test) {
      print "Zip archive not compatible.\n\n";
      exit(1);
    };

    print "Start processing ...\n";
    $t = Benchmark->new;
    my @dirs = $archive->list_texts;
    $count = scalar @dirs;

  ARCHIVE_LOOP:
    foreach my $path (@dirs) {

      # Split path information
      my (undef, $corpus, $doc_id, $text_id) = $archive->split_path($path);

      next ARCHIVE_LOOP if $skip_existing->(catdir($doc_id, $text_id));

      # Get the next fork - the parent continues with the next text
      $pool->start and next ARCHIVE_LOOP;

      # Create temporary file
      my $temp = File::Temp->newdir;

      my $msg;

      # Extract from archive
      if ($archive->extract($path, $temp)) {

        # Create corpus directory.
        # $input is reassigned on purpose: get_file_name (called by
        # write_file) strips it from the front of the text path
        $input = catdir("$temp", $corpus);

        # Temporary directory
        my $dir = catdir($input, $doc_id, $text_id);

        # Write file
        $msg = write_file($dir);

        # Drop the temporary directory object before reporting
        $temp = undef;
        $pool->finish(0, \$msg);
      }
      else {

        $temp = undef;
        $msg = "Unable to extract " . $path . "\n";
        $pool->finish(1, \$msg);
      };
    };
  }

  else {
    print "Input is neither a directory nor an archive.\n\n";

    # Nothing was started, so there is no start time to report against
    exit(1);
  };

  $pool->wait_all_children;

  print "Done.\n";
  print timestr(timediff(Benchmark->new, $t))."\n\n";
}

# Unknown command
else {
  warn "Unknown command '$cmd'.\n\n";
  pod2usage(%ERROR_HASH);
}
Nils Diewald	2db9ad0	2013-10-29 19:26:43 +0000	[diff] [blame]	452
				453	__END__
Akron	941c1a6	2016-02-23 17:41:41 +0100	[diff] [blame^]	454
				455	=pod
				456
				457	=encoding utf8
				458
				459	=head1 NAME
				460
				461	korapxml2krill - Merge KorapXML data and create Krill friendly documents
				462
				463
				464	=head1 SYNOPSIS
				465
				466	$ korapxml2krill [archive] -z --input <directory> --output <filename>
				467
				468
				469	=head1 DESCRIPTION
				470
				471	L<KorAP::XML::Krill> is a library to convert KorAP-XML documents to files
				472	compatible with the L<Krill\|https://github.com/KorAP/Krill> indexer.
				473
				474
				475	=head1 INSTALLATION
				476
				477	The preferred way to install L<KorAP::XML::Krill> is to use L<cpanm\|App::cpanminus>.
				478
				479	$ cpanm https://github.com/KorAP/KorAP-XML-Krill
				480
				481	In case everything went well, the C<korapxml2krill> command line tool will
				482	be available.
				483
				484
				485	=head1 ARGUMENTS
				486
				487	=over 2
				488
				489	=item B<archive>
				490
				491	Process an archive as a Zip-File or a folder of KorAP-XML documents.
				492
				493	=back
				494
				495
				496	=head1 OPTIONS
				497
				498	=over 2
				499
				500	=item B<--input\|-i> <directory\|file>
				501
				502	Directory or archive file of documents to index.
				503
				504	=item B<--output\|-o> <directory\|file>
				505
				506	Output folder for archive processing or
				507	document name for single output (optional),
				508	writes to <STDOUT> by default.
				509
				510	=item B<--overwrite\|-w>
				511
				512	Overwrite files that already exist.
				513
				514	=item B<--token\|-t> <foundry>[#<file>]
				515
				516	Define the default tokenization by specifying
				517	the name of the foundry and optionally the name
				518	of the layer-file. Defaults to OpenNLP#tokens.
				519
				520	=item B<--skip\|-s> <foundry>[#<layer>]
				521
				522	Skip specific foundries by specifying the name
				523	or specific layers by appending the layer name
				524	to the name of the foundry, separated by a #,
				525	e.g. Mate#Morpho. Alternatively you can skip #ALL.
				526	Can be set multiple times.
				527
				528	=item B<--allow\|-a> <foundry>#<layer>
				529
				530	Allow specific foundries and layers by defining them
				531	combining the foundry name with a # and the layer name.
				532
				533	=item B<--primary\|-p>
				534
				535	Output primary data or not. Defaults to true.
				536	Can be flagged using --no-primary as well.
				537
				538	=item B<--jobs\|-j>
				539
				540	Define the number of concurrent jobs in separate forks
				541	for archive processing, defaults to 0. This is B<EXPERIMENTAL>!
				542
				543	=item B<--human\|-m>
				544
				545	Represent the data human friendly, while the output defaults to JSON.
				546
				547	=item B<--pretty\|-y>
				548
				549	Pretty print JSON output.
				550
				551	=item B<--gzip\|-z>
				552
				553	Compress the output (expects a defined output file in single processing).
				554
				555	=item B<--log\|-l>
				556
				557	The L<Log4perl> log level, defaults to C<ERROR>.
				558
				559	=item B<--help\|-h>
				560
				561	Print this document.
				562
				563	=item B<--version\|-v>
				564
				565	Print version information.
				566
				567	=back
				568
				569	=head1 AVAILABILITY
				570
				571	https://github.com/KorAP/KorAP-XML-Krill
				572
				573
				574	=head1 COPYRIGHT AND LICENSE
				575
				576	Copyright (C) 2015-2016, L<IDS Mannheim\|http://www.ids-mannheim.de/>
				577	Author: L<Nils Diewald\|http://nils-diewald.de/>
				578
				579	L<KorAP::XML::Krill> is developed as part of the L<KorAP\|http://korap.ids-mannheim.de/>
				580	Corpus Analysis Platform at the
				581	L<Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				582	member of the
				583	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/en/about-us/leibniz-competition/projekte-2011/2011-funding-line-2/>.
				584
				585	This program is free software published under the
				586	L<BSD-2 License\|https://raw.githubusercontent.com/KorAP/KorAP-XML-Krill/master/LICENSE>.
				587
				588	=cut