Add metadata caching
Change-Id: Ic3fd0d353c66a8ae3732de7f6d342ed159f80b16
diff --git a/.gitignore b/.gitignore
index 8518f50..2aee81a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,7 @@
*.log
*.db
*.old
+*.tmp
+*.cache
.*
!.gitignore
diff --git a/Changes b/Changes
index 96c7f55..51dbb27 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.16 2016-03-18
+ - Added caching mechanism for
+ metadata.
+
0.15 2016-03-17
- Modularized metadata handling.
- Simplified metadata handling.
diff --git a/Makefile.PL b/Makefile.PL
index a2a43a7..8e659d4 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -11,26 +11,27 @@
VERSION_FROM => 'lib/KorAP/XML/Krill.pm',
LICENSE => 'bsd_2',
PREREQ_PM => {
- 'Mojolicious' => 6.11,
- 'Packed::Array' => 0.01,
- 'Log::Log4perl' => 1.42,
- 'JSON::XS' => 3.01,
- 'Set::Scalar' => 1.26,
- 'XML::Fast' => 0.11,
- 'Try::Tiny' => 0.21,
- 'Array::IntSpan' => 2.003,
+ 'Mojolicious' => 6.11,
+ 'Packed::Array' => 0.01,
+ 'Log::Log4perl' => 1.42,
+ 'JSON::XS' => 3.01,
+ 'Set::Scalar' => 1.26,
+ 'XML::Fast' => 0.11,
+ 'Try::Tiny' => 0.21,
+ 'Array::IntSpan' => 2.003,
'List::MoreUtils' => 0.33,
'Parallel::ForkManager' => 1.17,
'IO::Dir::Recursive' => 0.03,
- 'File::Temp' => 0,
+ 'File::Temp' => 0,
'Directory::Iterator' => 0,
- 'Benchmark' => 0,
- 'Carp' => 0,
- 'strict' => 0,
- 'warnings' => 0,
- 'utf8' => 0,
- 'bytes' => 0,
- 'Pod::Usage' => 0
+ 'Benchmark' => 0,
+ 'Carp' => 0,
+ 'strict' => 0,
+ 'warnings' => 0,
+ 'utf8' => 0,
+ 'bytes' => 0,
+ 'Pod::Usage' => 0,
+ 'Cache::FastMmap' => 1.40
},
MIN_PERL_VERSION => '5.014',
test => {
diff --git a/Readme.pod b/Readme.pod
index 62662df..58fd306 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -95,7 +95,7 @@
Define the number of concurrent jobs in seperated forks
for archive processing.
-Defaults to C<0>.
+Defaults to C<0> (everything runs in a single process).
This is I<experimental>.
=item B<--meta|-m>
@@ -114,6 +114,27 @@
Compress the output.
Expects a defined C<output> file in single processing.
+=item B<--cache|-c>
+
+File to mmap a cache (using L<Cache::FastMmap>).
+Defaults to C<korapxml2krill.cache> in the calling directory.
+
+=item B<--cache-size|-cs>
+
+Size of the cache. Defaults to C<50m>.
+
+=item B<--cache-init|-ci>
+
+Initialize cache file.
+Can be flagged using C<--no-cache-init> as well.
+Defaults to C<true>.
+
+=item B<--cache-delete|-cd>
+
+Delete cache file after processing.
+Can be flagged using C<--no-cache-delete> as well.
+Defaults to C<true>.
+
=item B<--sigle|-sg>
Extract the given text sigles.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 590e15d..29b8331 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -10,6 +10,7 @@
use KorAP::XML::Tokenizer;
use Log::Log4perl;
use KorAP::XML::Log;
+use Cache::FastMmap;
use Mojo::DOM;
use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
@@ -18,11 +19,12 @@
# Due to the kind of processing, processed metadata may be stored in
# a multiprocess cache instead.
-our $VERSION = '0.15';
+our $VERSION = '0.16';
has 'path';
has [qw/text_sigle doc_sigle corpus_sigle/];
has 'meta_type' => 'I5';
+has 'cache';
has log => sub {
if(Log::Log4perl->initialized()) {
@@ -129,7 +131,8 @@
log => $self->log,
corpus_sigle => $self->corpus_sigle,
doc_sigle => $self->doc_sigle,
- text_sigle => $self->text_sigle
+ text_sigle => $self->text_sigle,
+ cache => $self->cache
);
# Associate meta object
@@ -147,6 +150,9 @@
# Get corpus, doc and text meta data
my $type = shift(@type);
+ # Check for cache
+ next if $meta->is_cached($type);
+
next unless -e $_;
# Slurp data and probably decode
@@ -159,6 +165,7 @@
# Parse object based on DOM
$meta->parse($dom, $type);
+ $meta->do_cache($type);
};
return $self;
@@ -239,10 +246,7 @@
# Get meta object
my $meta = $self->meta;
- foreach (keys %$meta) {
-
- # Ignore private keys
- next if index($_, '_') == 0;
+ foreach ($meta->keys) {
my $v = $meta->{$_};
if (ref $v) {
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index a92ebd5..a665d71 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -33,11 +33,15 @@
$_[0]->{_text_sigle};
};
+sub cache {
+ $_[0]->{_cache};
+}
+
sub new {
my $class = shift;
my %hash = @_;
my $copy = {};
- foreach (qw/log corpus_sigle doc_sigle text_sigle/) {
+ foreach (qw/log cache corpus_sigle doc_sigle text_sigle/) {
$copy->{'_' . $_} = $hash{$_};
};
@@ -49,5 +53,60 @@
return join(' ', @{$self->{$_[0]} // []});
};
+# Check if cached
+sub is_cached {
+ my ($self, $type) = @_;
+
+ return if $type eq 'text';
+ return unless $self->cache;
+
+ my $value;
+ my $cache = $self->cache;
+ if ($type eq 'corpus') {
+ $value = $cache->get($self->corpus_sigle);
+ }
+ elsif ($type eq 'doc') {
+ $value = $cache->get($self->doc_sigle);
+ };
+
+ if ($value) {
+ foreach (grep {index($_, '_') != 0 } keys %$value) {
+ $self->{$_} = $value->{$_};
+ };
+ return 1;
+ };
+
+ return;
+};
+
+sub keys {
+ my $self = shift;
+ return grep {index($_, '_') != 0 } keys %$self;
+};
+
+sub do_cache {
+ my ($self, $type) = @_;
+
+ return if $type eq 'text';
+ return unless $self->cache;
+
+ my %value;
+ foreach ($self->keys) {
+ $value{$_} = $self->{$_};
+ };
+
+ my $cache = $self->cache;
+
+ if ($type eq 'corpus') {
+ $cache->set($self->corpus_sigle, \%value);
+ return 1;
+ }
+ elsif ($type eq 'doc') {
+ $cache->set($self->doc_sigle, \%value);
+ return 1;
+ };
+
+ return 0;
+};
1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c5db742..99dd333 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -9,6 +9,7 @@
use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
use Pod::Usage;
+use Cache::FastMmap;
use Directory::Iterator;
use KorAP::XML::Krill;
use KorAP::XML::Archive;
@@ -49,6 +50,9 @@
#
# 2016/03/17
# - Added meta switch
+#
+# 2016/03/18
+# - Added meta data caching
# ----------------------------------------------------------
our $LAST_CHANGE = '2016/03/17';
@@ -73,12 +77,15 @@
'input|i=s' => \(my $input),
'output|o=s' => \(my $output),
'overwrite|w' => \(my $overwrite),
-# 'human|m' => \(my $text),
'meta|m=s' => \(my $meta),
'token|t=s' => \(my $token_base),
'gzip|z' => \(my $gzip),
'skip|s=s' => \@skip,
'sigle|sg=s' => \@sigle,
+ 'cache|c=s' => \(my $cache_file = 'korapxml2krill.cache'),
+ 'cache-size|cs=s' => \(my $cache_size = '50m'),
+ 'cache-delete|cd!' => \(my $cache_delete = 1),
+ 'cache-init|ci!' => \(my $cache_init = 1),
'log|l=s' => \(my $log_level = 'ERROR'),
'anno|a=s' => \(my @anno),
'primary|p!' => \(my $primary),
@@ -141,11 +148,14 @@
my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
$anno . ' -o ' . $output . '/' . $file . '.json';
$call .= '.gz -z' if $gzip;
-# $call .= ' -m' if $text;
$call .= ' -m ' . $meta if $meta;
$call .= ' -w' if $overwrite;
$call .= ' -t ' . $token_base if $token_base;
$call .= ' -l ' . $log_level if $log_level;
+ $call .= ' -c ' . $cache_file;
+ $call .= ' -cs ' . $cache_size;
+ $call .= ' --no-cache-delete'; # Don't delete the cache
+ $call .= ' --no-cache-init'; # Don't initialize the cache
$call .= ' --no-primary ' if $primary;
$call .= ' -y ' . $pretty if $pretty;
$call .= ' -a ' . $_ foreach @anno;
@@ -167,7 +177,6 @@
my %skip;
$skip{lc($_)} = 1 foreach @skip;
-
# Ignore processing
if (!$overwrite && $output && -e $output) {
$log->trace($output . ' already exists');
@@ -193,7 +202,12 @@
$input =~ s{([^/])$}{$1/};
my $doc = KorAP::XML::Krill->new(
path => $input,
- meta_type => ($meta // 'I5')
+ meta_type => ($meta // 'I5'),
+ cache => Cache::FastMmap->new(
+ share_file => $cache_file,
+ cache_size => $cache_size,
+ init_file => $cache_init
+ )
);
unless ($doc->parse) {
@@ -306,6 +320,9 @@
print $print_text . "\n";
};
+ # Delete cache file
+ unlink($cache_file) if $cache_delete;
+
stop_time;
}
@@ -375,6 +392,15 @@
my $t;
print "Reading data ...\n";
+ unless (Cache::FastMmap->new(
+ share_file => $cache_file,
+ cache_size => $cache_size,
+ init_file => $cache_init
+ )) {
+ print "Unable to intialize cache '$cache_file'\n\n";
+ exit(1);
+ };
+
# Input is a directory
if (-d $input) {
my $it = Directory::Iterator->new($input);
@@ -492,6 +518,9 @@
$pool->wait_all_children;
+ # Delete cache file
+ unlink($cache_file) if $cache_delete;
+
print "Done.\n";
print timestr(timediff(Benchmark->new, $t))."\n\n";
}
@@ -601,7 +630,7 @@
Define the number of concurrent jobs in seperated forks
for archive processing.
-Defaults to C<0>.
+Defaults to C<0> (everything runs in a single process).
This is I<experimental>.
=item B<--meta|-m>
@@ -620,6 +649,27 @@
Compress the output.
Expects a defined C<output> file in single processing.
+=item B<--cache|-c>
+
+File to mmap a cache (using L<Cache::FastMmap>).
+Defaults to C<korapxml2krill.cache> in the calling directory.
+
+=item B<--cache-size|-cs>
+
+Size of the cache. Defaults to C<50m>.
+
+=item B<--cache-init|-ci>
+
+Initialize cache file.
+Can be flagged using C<--no-cache-init> as well.
+Defaults to C<true>.
+
+=item B<--cache-delete|-cd>
+
+Delete cache file after processing.
+Can be flagged using C<--no-cache-delete> as well.
+Defaults to C<true>.
+
=item B<--sigle|-sg>
Extract the given text sigles.