Add metadata caching

Change-Id: Ic3fd0d353c66a8ae3732de7f6d342ed159f80b16
diff --git a/.gitignore b/.gitignore
index 8518f50..2aee81a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,7 @@
 *.log
 *.db
 *.old
+*.tmp
+*.cache
 .*
 !.gitignore
diff --git a/Changes b/Changes
index 96c7f55..51dbb27 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,7 @@
+0.16 2016-03-18
+        - Added caching mechanism for
+	  metadata.
+
 0.15 2016-03-17
         - Modularized metadata handling.
         - Simplified metadata handling.
diff --git a/Makefile.PL b/Makefile.PL
index a2a43a7..8e659d4 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -11,26 +11,27 @@
   VERSION_FROM => 'lib/KorAP/XML/Krill.pm',
   LICENSE      => 'bsd_2',
   PREREQ_PM => {
-    'Mojolicious'    => 6.11,
-    'Packed::Array'  => 0.01,
-    'Log::Log4perl'  => 1.42,
-    'JSON::XS'       => 3.01,
-    'Set::Scalar'    => 1.26,
-    'XML::Fast'      => 0.11,
-    'Try::Tiny'      => 0.21,
-    'Array::IntSpan' => 2.003,
+    'Mojolicious'     => 6.11,
+    'Packed::Array'   => 0.01,
+    'Log::Log4perl'   => 1.42,
+    'JSON::XS'        => 3.01,
+    'Set::Scalar'     => 1.26,
+    'XML::Fast'       => 0.11,
+    'Try::Tiny'       => 0.21,
+    'Array::IntSpan'  => 2.003,
     'List::MoreUtils' => 0.33,
     'Parallel::ForkManager' => 1.17,
     'IO::Dir::Recursive' => 0.03,
-    'File::Temp'     => 0,
+    'File::Temp'      => 0,
     'Directory::Iterator' => 0,
-    'Benchmark'      => 0,
-    'Carp'           => 0,
-    'strict'         => 0,
-    'warnings'       => 0,
-    'utf8'           => 0,
-    'bytes'          => 0,
-    'Pod::Usage'     => 0
+    'Benchmark'       => 0,
+    'Carp'            => 0,
+    'strict'          => 0,
+    'warnings'        => 0,
+    'utf8'            => 0,
+    'bytes'           => 0,
+    'Pod::Usage'      => 0,
+    'Cache::FastMmap' => 1.40
   },
   MIN_PERL_VERSION => '5.014',
   test => {
diff --git a/Readme.pod b/Readme.pod
index 62662df..58fd306 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -95,7 +95,7 @@
 
 Define the number of concurrent jobs in seperated forks
 for archive processing.
-Defaults to C<0>.
+Defaults to C<0> (everything runs in a single process).
 This is I<experimental>.
 
 =item B<--meta|-m>
@@ -114,6 +114,27 @@
 Compress the output.
 Expects a defined C<output> file in single processing.
 
+=item B<--cache|-c>
+
+File to mmap a cache (using L<Cache::FastMmap>).
+Defaults to C<korapxml2krill.cache> in the calling directory.
+
+=item B<--cache-size|-cs>
+
+Size of the cache. Defaults to C<50m>.
+
+=item B<--cache-init|-ci>
+
+Initialize cache file.
+Can be flagged using C<--no-cache-init> as well.
+Defaults to C<true>.
+
+=item B<--cache-delete|-cd>
+
+Delete cache file after processing.
+Can be flagged using C<--no-cache-delete> as well.
+Defaults to C<true>.
+
 =item B<--sigle|-sg>
 
 Extract the given text sigles.
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 590e15d..29b8331 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -10,6 +10,7 @@
 use KorAP::XML::Tokenizer;
 use Log::Log4perl;
 use KorAP::XML::Log;
+use Cache::FastMmap;
 use Mojo::DOM;
 use Data::Dumper;
 use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
@@ -18,11 +19,12 @@
 #       Due to the kind of processing, processed metadata may be stored in
 #       a multiprocess cache instead.
 
-our $VERSION = '0.15';
+our $VERSION = '0.16';
 
 has 'path';
 has [qw/text_sigle doc_sigle corpus_sigle/];
 has 'meta_type' => 'I5';
+has 'cache';
 
 has log => sub {
   if(Log::Log4perl->initialized()) {
@@ -129,7 +131,8 @@
       log          => $self->log,
       corpus_sigle => $self->corpus_sigle,
       doc_sigle    => $self->doc_sigle,
-      text_sigle   => $self->text_sigle
+      text_sigle   => $self->text_sigle,
+      cache        => $self->cache
     );
 
     # Associate meta object
@@ -147,6 +150,9 @@
     # Get corpus, doc and text meta data
     my $type = shift(@type);
 
+    # Check for cache
+    next if $meta->is_cached($type);
+
     next unless -e $_;
 
     # Slurp data and probably decode
@@ -159,6 +165,7 @@
 
     # Parse object based on DOM
     $meta->parse($dom, $type);
+    $meta->do_cache($type);
   };
 
   return $self;
@@ -239,10 +246,7 @@
 
   # Get meta object
   my $meta = $self->meta;
-  foreach (keys %$meta) {
-
-    # Ignore private keys
-    next if index($_, '_') == 0;
+  foreach ($meta->keys) {
 
     my $v = $meta->{$_};
     if (ref $v) {
diff --git a/lib/KorAP/XML/Meta/Base.pm b/lib/KorAP/XML/Meta/Base.pm
index a92ebd5..a665d71 100644
--- a/lib/KorAP/XML/Meta/Base.pm
+++ b/lib/KorAP/XML/Meta/Base.pm
@@ -33,11 +33,15 @@
   $_[0]->{_text_sigle};
 };
 
+sub cache {
+  $_[0]->{_cache};
+}
+
 sub new {
   my $class = shift;
   my %hash = @_;
   my $copy = {};
-  foreach (qw/log corpus_sigle doc_sigle text_sigle/) {
+  foreach (qw/log cache corpus_sigle doc_sigle text_sigle/) {
     $copy->{'_' . $_} = $hash{$_};
   };
 
@@ -49,5 +53,60 @@
   return join(' ', @{$self->{$_[0]} // []});
 };
 
+# Check if cached
+sub is_cached {
+  my ($self, $type) = @_;
+
+  return if $type eq 'text';
+  return unless $self->cache;
+
+  my $value;
+  my $cache = $self->cache;
+  if ($type eq 'corpus') {
+    $value = $cache->get($self->corpus_sigle);
+  }
+  elsif ($type eq 'doc') {
+    $value = $cache->get($self->doc_sigle);
+  };
+
+  if ($value) {
+    foreach (grep {index($_, '_') != 0 } keys %$value) {
+      $self->{$_} = $value->{$_};
+    };
+    return 1;
+  };
+
+  return;
+};
+
+sub keys {
+  my $self = shift;
+  return grep {index($_, '_') != 0 } keys %$self;
+};
+
+sub do_cache {
+  my ($self, $type) = @_;
+
+  return if $type eq 'text';
+  return unless $self->cache;
+
+  my %value;
+  foreach ($self->keys) {
+    $value{$_} = $self->{$_};
+  };
+
+  my $cache = $self->cache;
+
+  if ($type eq 'corpus') {
+    $cache->set($self->corpus_sigle, \%value);
+    return 1;
+  }
+  elsif ($type eq 'doc') {
+    $cache->set($self->doc_sigle, \%value);
+    return 1;
+  };
+
+  return 0;
+};
 
 1;
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c5db742..99dd333 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -9,6 +9,7 @@
 use IO::Compress::Gzip qw/$GzipError/;
 use Log::Log4perl;
 use Pod::Usage;
+use Cache::FastMmap;
 use Directory::Iterator;
 use KorAP::XML::Krill;
 use KorAP::XML::Archive;
@@ -49,6 +50,9 @@
 #
 # 2016/03/17
 # - Added meta switch
+#
+# 2016/03/18
+# - Added meta data caching
 # ----------------------------------------------------------
 
 our $LAST_CHANGE = '2016/03/17';
@@ -73,12 +77,15 @@
   'input|i=s'   => \(my $input),
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
-#  'human|m'     => \(my $text),
   'meta|m=s'    => \(my $meta),
   'token|t=s'   => \(my $token_base),
   'gzip|z'      => \(my $gzip),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
+  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
+  'cache-size|cs=s'   => \(my $cache_size = '50m'),
+  'cache-delete|cd!' => \(my $cache_delete = 1),
+  'cache-init|ci!'   => \(my $cache_init = 1),
   'log|l=s'     => \(my $log_level = 'ERROR'),
   'anno|a=s'    => \(my @anno),
   'primary|p!'  => \(my $primary),
@@ -141,11 +148,14 @@
   my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
     $anno . ' -o ' . $output . '/' . $file . '.json';
   $call .= '.gz -z' if $gzip;
-#  $call .= ' -m' if $text;
   $call .= ' -m ' . $meta if $meta;
   $call .= ' -w' if $overwrite;
   $call .= ' -t ' . $token_base if $token_base;
   $call .= ' -l ' . $log_level if $log_level;
+  $call .= ' -c ' . $cache_file;
+  $call .= ' -cs ' . $cache_size;
+  $call .= ' --no-cache-delete'; # Don't delete the cache
+  $call .= ' --no-cache-init'; # Don't initialize the cache
   $call .= ' --no-primary ' if $primary;
   $call .= ' -y ' . $pretty if $pretty;
   $call .= ' -a ' . $_ foreach @anno;
@@ -167,7 +177,6 @@
   my %skip;
   $skip{lc($_)} = 1 foreach @skip;
 
-
   # Ignore processing
   if (!$overwrite && $output && -e $output) {
     $log->trace($output . ' already exists');
@@ -193,7 +202,12 @@
   $input =~ s{([^/])$}{$1/};
   my $doc = KorAP::XML::Krill->new(
     path => $input,
-    meta_type => ($meta // 'I5')
+    meta_type => ($meta // 'I5'),
+    cache => Cache::FastMmap->new(
+      share_file => $cache_file,
+      cache_size => $cache_size,
+      init_file => $cache_init
+    )
   );
 
   unless ($doc->parse) {
@@ -306,6 +320,9 @@
     print $print_text . "\n";
   };
 
+  # Delete cache file
+  unlink($cache_file) if $cache_delete;
+
   stop_time;
 }
 
@@ -375,6 +392,15 @@
   my $t;
   print "Reading data ...\n";
 
+  unless (Cache::FastMmap->new(
+    share_file => $cache_file,
+    cache_size => $cache_size,
+    init_file => $cache_init
+  )) {
+    print "Unable to intialize cache '$cache_file'\n\n";
+    exit(1);
+  };
+
   # Input is a directory
   if (-d $input) {
     my $it = Directory::Iterator->new($input);
@@ -492,6 +518,9 @@
 
   $pool->wait_all_children;
 
+  # Delete cache file
+  unlink($cache_file) if $cache_delete;
+
   print "Done.\n";
   print timestr(timediff(Benchmark->new, $t))."\n\n";
 }
@@ -601,7 +630,7 @@
 
 Define the number of concurrent jobs in seperated forks
 for archive processing.
-Defaults to C<0>.
+Defaults to C<0> (everything runs in a single process).
 This is I<experimental>.
 
 =item B<--meta|-m>
@@ -620,6 +649,27 @@
 Compress the output.
 Expects a defined C<output> file in single processing.
 
+=item B<--cache|-c>
+
+File to mmap a cache (using L<Cache::FastMmap>).
+Defaults to C<korapxml2krill.cache> in the calling directory.
+
+=item B<--cache-size|-cs>
+
+Size of the cache. Defaults to C<50m>.
+
+=item B<--cache-init|-ci>
+
+Initialize cache file.
+Can be flagged using C<--no-cache-init> as well.
+Defaults to C<true>.
+
+=item B<--cache-delete|-cd>
+
+Delete cache file after processing.
+Can be flagged using C<--no-cache-delete> as well.
+Defaults to C<true>.
+
 =item B<--sigle|-sg>
 
 Extract the given text sigles.