Clean up main script by simplifying configuration file handling
Change-Id: Ife872a344cd652936a1198dc0f4377f34970903b
diff --git a/script/korapxml2krill b/script/korapxml2krill
index 7d246ec..cb1863c 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -1,6 +1,7 @@
#!/usr/bin/env perl
use strict;
use warnings;
+use v5.10;
use FindBin;
BEGIN { unshift @INC, "$FindBin::Bin/../lib" };
use File::Spec::Functions qw/catfile catdir/;
@@ -18,7 +19,6 @@
use KorAP::XML::Batch::File;
use Config::Simple;
use Parallel::ForkManager;
-use v5.10;
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );
use File::Glob ':bsd_glob';
@@ -154,16 +154,13 @@
# - Added support for Redewiedergabe-Korpus morphology
# ----------------------------------------------------------
-our $LAST_CHANGE = '2020/04/23';
+our $LAST_CHANGE = '2020/08/07';
our $LOCAL = $FindBin::Bin;
our $KORAL_VERSION = 0.03;
our $VERSION_MSG = <<"VERSION";
Version $KorAP::XML::Krill::VERSION - diewald\@ids-mannheim.de - $LAST_CHANGE
VERSION
-# Prototypes
-sub get_file_name($$);
-
# Parse comand
my $cmd;
our @ARGV;
@@ -173,38 +170,40 @@
my @keep_argv = @ARGV;
my (@skip, @sigle, @anno, @input);
-my $text;
+
+# Configuration hash
+my %cfg = ();
# Parse options from the command line
GetOptions(
'input|i=s' => \@input,
- 'input-base|ib=s' => \(my $input_base),
- 'output|o=s' => \(my $output),
- 'overwrite|w' => \(my $overwrite),
- 'meta|m=s' => \(my $meta),
- 'token|t=s' => \(my $token_base),
- 'base-sentences|bs=s' => \(my $base_sentences),
- 'base-paragraphs|bp=s' => \(my $base_paragraphs),
- 'base-pagebreaks|bpb=s' => \(my $base_pagebreaks),
- 'gzip|z' => \(my $gzip),
- 'temporary-extract|te=s' => \(my $extract_dir),
+ 'input-base|ib=s' => \($cfg{input_base}),
+ 'output|o=s' => \($cfg{output}),
+ 'overwrite|w' => \($cfg{overwrite}),
+ 'meta|m=s' => \($cfg{meta}),
+ 'token|t=s' => \($cfg{token}),
+ 'base-sentences|bs=s' => \($cfg{base_sentences}),
+ 'base-paragraphs|bp=s' => \($cfg{base_paragraphs}),
+ 'base-pagebreaks|bpb=s' => \($cfg{base_pagebreaks}),
+ 'gzip|z' => \($cfg{gzip}),
+ 'temporary-extract|te=s' => \($cfg{extract_dir}),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
- 'cache|c=s' => \(my $cache_file),
+ 'cache|c=s' => \($cfg{cache}), # same key the config-file loop fills and $cache_file is read from
'config|cfg=s' => \(my $cfg_file),
- 'log|l=s' => \(my $log_level),
+ 'log|l=s' => \($cfg{log}),
'anno|a=s' => \@anno,
'primary|p!' => \(my $primary),
'pretty|y' => \(my $pretty),
- 'jobs|j=i' => \(my $jobs),
- 'koral|k=f' => \(my $koral),
- 'to-tar' => \(my $to_tar),
- 'non-word-tokens|nwt' => \(my $non_word_tokens),
- 'non-verbal-tokens|nvt' => \(my $non_verbal_tokens),
- 'sequential-extraction|se' => \(my $sequential_extraction),
- 'cache-size|cs=s' => \(my $cache_size),
- 'cache-delete|cd!' => \(my $cache_delete),
- 'cache-init|ci!' => \(my $cache_init),
+ 'jobs|j=i' => \($cfg{jobs}),
+ 'koral|k=f' => \($cfg{koral}),
+ 'to-tar' => \($cfg{to_tar}),
+ 'non-word-tokens|nwt' => \($cfg{non_word_tokens}),
+ 'non-verbal-tokens|nvt' => \($cfg{non_verbal_tokens}),
+ 'sequential-extraction|se' => \($cfg{sequential_extraction}),
+ 'cache-size|cs=s' => \($cfg{cache_size}),
+ 'cache-delete|cd!' => \($cfg{cache_delete}),
+ 'cache-init|ci!' => \($cfg{cache_init}),
'help|h' => sub {
pod2usage(
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -223,115 +222,23 @@
);
-# Load from configuration
+# Load from configuration and fill non-given data
if ($cfg_file && -e $cfg_file) {
my %config;
+ print "Reading config from $cfg_file\n";
+
Config::Simple->import_from($cfg_file, \%config);
- # Overwrite
- if (!defined($overwrite) && defined $config{overwrite}) {
- $overwrite = $config{overwrite};
- };
-
- # Gzip
- if (!defined($gzip) && defined $config{gzip}) {
- $gzip = $config{gzip};
- };
-
- # Jobs
- if (!defined($jobs) && defined $config{jobs}) {
- $jobs = $config{jobs};
- };
-
- # Koral version
- if (!defined($koral) && defined $config{koral}) {
- $koral = $config{koral};
- };
-
- # Input root base directory
- if (!defined($input_base) && defined $config{'input-base'}) {
- $input_base = $config{'input-base'};
- };
-
- # temporary-extract
- if (!defined($extract_dir) && defined $config{'temporary-extract'}) {
- $extract_dir = $config{'temporary-extract'};
- };
-
- # Token base
- if (!defined($token_base) && defined $config{token}) {
- $token_base = $config{token};
- };
-
- # Non-word tokenization
- if (!defined($non_word_tokens) && defined $config{'non-word-tokens'}) {
- $non_word_tokens = $config{'non-word-tokens'};
- };
-
- # Non-verbal tokenization
- if (!defined($non_verbal_tokens) && defined $config{'non-verbal-tokens'}) {
- $non_verbal_tokens = $config{'non-verbal-tokens'};
- };
-
- # Cache file
- if (!defined($cache_file) && defined $config{cache}) {
- $cache_file = $config{cache};
- };
-
- # Cache size
- if (!defined($cache_size) && defined $config{'cache-size'}) {
- $cache_size = $config{'cache-size'};
- };
-
- # Cache delete
- if (!defined($cache_delete) && defined $config{'cache-delete'}) {
- $cache_delete = $config{'cache-delete'} ;
- };
-
- # Cache init
- if (!(defined $cache_init) && defined $config{'cache-init'}) {
- $cache_init = $config{'cache-init'} ;
- };
-
- # Jobs for extraction
- if (!(defined $sequential_extraction) && defined $config{'sequential-extraction'}) {
- $sequential_extraction = $config{'sequential-extraction'} ;
- };
-
- # Meta
- if (!(defined $meta) && defined $config{'meta'}) {
- $meta = $config{'meta'} ;
- };
-
- # Output
- if (!(defined $output) && defined $config{'output'}) {
- $output = $config{'output'} ;
- };
-
- # Base-sentences
- if (!(defined $base_sentences) && defined $config{'base-sentences'}) {
- $base_sentences = $config{'base-sentences'} ;
- };
-
- # Base-paragraphs
- if (!(defined $base_paragraphs) && defined $config{'base-paragraphs'}) {
- $base_paragraphs = $config{'base-paragraphs'} ;
- };
-
- # Base-pagebreaks
- if (!(defined $base_pagebreaks) && defined $config{'base-pagebreaks'}) {
- $base_pagebreaks = $config{'base-pagebreaks'} ;
- };
-
- # Write to tar
- if (!(defined $to_tar) && defined $config{'to-tar'}) {
- $to_tar = $config{'to-tar'} ;
- };
-
- # Log
- if (!(defined $log_level) && defined $config{'log'}) {
- $log_level = $config{'log'} ;
+ foreach (qw!output cache-size input-base token overwrite
+             meta base-sentences base-paragraphs base-pagebreaks
+             gzip to-tar log cache non-word-tokens
+             non-verbal-tokens sequential-extraction cache-init
+             cache-delete koral temporary-extract extract-dir jobs!) {
+   # Derive the %cfg key from the option name (copy first: $_ aliases a read-only constant)
+   (my $underlined = $_) =~ tr/-/_/;
+   $underlined = 'extract_dir' if $_ eq 'temporary-extract'; # legacy config key
+   $cfg{$underlined} //= $config{$_}; # values given on the command line take precedence
+ };
# Skip
@@ -350,31 +257,36 @@
};
};
+# Init variables and set default values
+my $output = $cfg{output};
+my $input_base = $cfg{input_base};
+my $gzip = $cfg{gzip};
+my $to_tar = $cfg{to_tar};
+my $extract_dir = $cfg{extract_dir};
+my $token_base = $cfg{token} // 'OpenNLP#tokens';
+my $cache_file = $cfg{cache} // 'korapxml2krill.cache';
+my $jobs = $cfg{jobs} // 0;
+my $cache_delete = $cfg{cache_delete} // 1;
+my $base_sentences = lc($cfg{base_sentences} // '');
+my $base_paragraphs = lc($cfg{base_paragraphs} // '');
+my $base_pagebreaks = lc($cfg{base_pagebreaks} // '');
+my $sequential_extraction = $cfg{sequential_extraction} // 0;
-# Set default token base
-$token_base //= 'OpenNLP#tokens';
-$cache_file //= 'korapxml2krill.cache';
-$cache_size //= '50m';
-$jobs //= 0;
-$koral //= $KORAL_VERSION;
-$cache_delete //= 1;
-$cache_init //= 1;
-$sequential_extraction //= 0;
-$log_level //= 'ERROR';
-$base_sentences //= '';
-$base_paragraphs //= '';
-$base_pagebreaks //= '';
-$non_word_tokens //= 0;
-$non_verbal_tokens //= 0;
+# Get tokenization basis
+ my ($token_base_foundry, $token_base_layer) = $token_base ? split(/#/, $token_base) : (); # no "my ... if": that construct is undefined behaviour
-$base_sentences = lc $base_sentences;
-$base_paragraphs = lc $base_paragraphs;
-$base_pagebreaks = lc $base_pagebreaks;
+# Remove file extension
+ $token_base_layer =~ s/\.xml$//i if defined $token_base_layer; # layer is undef when --token has no '#layer' part
+# Convert sigle to path construct
+s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
+
+my %skip;
+$skip{lc($_)} = 1 foreach @skip;
# Initialize log4perl object
Log::Log4perl->init({
- 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
+ 'log4perl.rootLogger' => uc($cfg{log} // 'ERROR') . ', STDERR',
'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
'log4perl.appender.STDERR.layout' => 'PatternLayout',
'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
@@ -382,9 +294,10 @@
my $log = Log::Log4perl->get_logger('main');
-
-print "Reading config from $cfg_file\n" if $cfg_file;
-
+if ($cmd && $output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
+ $log->error("Directory '$output' does not exist.");
+ exit 1;
+};
my %ERROR_HASH = (
-sections => 'NAME|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -401,8 +314,9 @@
pod2usage(%ERROR_HASH) if $gzip && !$output;
+# Auto adjust jobs
if ($jobs eq '-1') {
- state $cores = Sys::Info->new->device('CPU')->count;
+ my $cores = Sys::Info->new->device('CPU')->count;
$jobs = ceil(5 * $cores);
$log->info("Run using $jobs jobs on $cores cores");
};
@@ -463,9 +377,6 @@
exit;
};
-my %skip;
-$skip{lc($_)} = 1 foreach @skip;
-
my @layers;
push(@layers, ['Base', 'Sentences']) unless $base_sentences;
push(@layers, ['Base', 'Paragraphs']) unless $base_paragraphs;
@@ -584,47 +495,31 @@
};
};
-# Get tokenization basis
-my ($token_base_foundry, $token_base_layer) = split(/#/, $token_base) if $token_base;
-
-# Remove file extension
-$token_base_layer =~ s/\.xml$//i;
# TODO: This should not be initialized for batch
my $cache = Cache::FastMmap->new(
share_file => $cache_file,
- cache_size => $cache_size,
- init_file => $cache_init
+ cache_size => ($cfg{cache_size} // '50m'),
+ init_file => ($cfg{cache_init} // 1)
);
# Create batch object
my $batch_file = KorAP::XML::Batch::File->new(
cache => $cache,
- meta_type => $meta,
- overwrite => $overwrite,
+ meta_type => $cfg{meta},
+ overwrite => $cfg{overwrite},
foundry => $token_base_foundry,
layer => $token_base_layer,
gzip => $gzip,
log => $log,
- koral => $koral,
+ koral => ($cfg{koral} // $KORAL_VERSION),
primary => $primary,
pretty => $pretty,
anno => \@filtered_anno,
- non_word_tokens => $non_word_tokens,
- non_verbal_tokens => $non_verbal_tokens
+ non_word_tokens => ($cfg{non_word_tokens} // 0),
+ non_verbal_tokens => ($cfg{non_verbal_tokens} // 0)
);
-# Convert sigle to path construct
-s!^\s*([^_]+?)_([^\.]+?)\.(.+?)\s*$!$1/$2/$3! foreach @sigle;
-
-if ($cmd) {
- if ($output && (!defined($to_tar)) && (!-e $output || !-d $output)) {
- $log->error("Directory '$output' does not exist.");
- exit 1;
- };
-};
-
-
# Glob and prefix files
if (@input) {
@@ -1076,11 +971,10 @@
};
-
# Cleanup temporary extraction directory
if ($extract_dir) {
my $objects = remove_tree($extract_dir, { safe => 1 });
- print "Removed directory $extract_dir with $objects objects.\n";
+ $log->info("Removed directory $extract_dir with $objects objects");
};