Add metadata caching

Change-Id: Ic3fd0d353c66a8ae3732de7f6d342ed159f80b16
diff --git a/script/korapxml2krill b/script/korapxml2krill
index c5db742..99dd333 100644
--- a/script/korapxml2krill
+++ b/script/korapxml2krill
@@ -9,6 +9,7 @@
 use IO::Compress::Gzip qw/$GzipError/;
 use Log::Log4perl;
 use Pod::Usage;
+use Cache::FastMmap;
 use Directory::Iterator;
 use KorAP::XML::Krill;
 use KorAP::XML::Archive;
@@ -49,6 +50,9 @@
 #
 # 2016/03/17
 # - Added meta switch
+#
+# 2016/03/18
+# - Added metadata caching
 # ----------------------------------------------------------
 
 our $LAST_CHANGE = '2016/03/17';
@@ -73,12 +77,15 @@
   'input|i=s'   => \(my $input),
   'output|o=s'  => \(my $output),
   'overwrite|w' => \(my $overwrite),
-#  'human|m'     => \(my $text),
   'meta|m=s'    => \(my $meta),
   'token|t=s'   => \(my $token_base),
   'gzip|z'      => \(my $gzip),
   'skip|s=s'    => \@skip,
   'sigle|sg=s'  => \@sigle,
+  'cache|c=s'   => \(my $cache_file = 'korapxml2krill.cache'),
+  'cache-size|cs=s'   => \(my $cache_size = '50m'),
+  'cache-delete|cd!' => \(my $cache_delete = 1),
+  'cache-init|ci!'   => \(my $cache_init = 1),
   'log|l=s'     => \(my $log_level = 'ERROR'),
   'anno|a=s'    => \(my @anno),
   'primary|p!'  => \(my $primary),
@@ -141,11 +148,14 @@
   my $call = 'perl ' . $LOCAL . '/korapxml2krill -i ' .
     $anno . ' -o ' . $output . '/' . $file . '.json';
   $call .= '.gz -z' if $gzip;
-#  $call .= ' -m' if $text;
   $call .= ' -m ' . $meta if $meta;
   $call .= ' -w' if $overwrite;
   $call .= ' -t ' . $token_base if $token_base;
   $call .= ' -l ' . $log_level if $log_level;
+  $call .= ' -c ' . $cache_file;
+  $call .= ' -cs ' . $cache_size;
+  $call .= ' --no-cache-delete'; # Don't delete the cache
+  $call .= ' --no-cache-init'; # Don't initialize the cache
   $call .= ' --no-primary ' if $primary;
   $call .= ' -y ' . $pretty if $pretty;
   $call .= ' -a ' . $_ foreach @anno;
@@ -167,7 +177,6 @@
   my %skip;
   $skip{lc($_)} = 1 foreach @skip;
 
-
   # Ignore processing
   if (!$overwrite && $output && -e $output) {
     $log->trace($output . ' already exists');
@@ -193,7 +202,12 @@
   $input =~ s{([^/])$}{$1/};
   my $doc = KorAP::XML::Krill->new(
     path => $input,
-    meta_type => ($meta // 'I5')
+    meta_type => ($meta // 'I5'),
+    cache => Cache::FastMmap->new(
+      share_file => $cache_file,
+      cache_size => $cache_size,
+      init_file => $cache_init
+    )
   );
 
   unless ($doc->parse) {
@@ -306,6 +320,9 @@
     print $print_text . "\n";
   };
 
+  # Delete cache file
+  unlink($cache_file) if $cache_delete;
+
   stop_time;
 }
 
@@ -375,6 +392,15 @@
   my $t;
   print "Reading data ...\n";
 
+  unless (Cache::FastMmap->new(
+    share_file => $cache_file,
+    cache_size => $cache_size,
+    init_file => $cache_init
+  )) {
+    print "Unable to intialize cache '$cache_file'\n\n";
+    exit(1);
+  };
+
   # Input is a directory
   if (-d $input) {
     my $it = Directory::Iterator->new($input);
@@ -492,6 +518,9 @@
 
   $pool->wait_all_children;
 
+  # Delete cache file
+  unlink($cache_file) if $cache_delete;
+
   print "Done.\n";
   print timestr(timediff(Benchmark->new, $t))."\n\n";
 }
@@ -601,7 +630,7 @@
 
 Define the number of concurrent jobs in seperated forks
 for archive processing.
-Defaults to C<0>.
+Defaults to C<0> (everything runs in a single process).
 This is I<experimental>.
 
 =item B<--meta|-m>
@@ -620,6 +649,27 @@
 Compress the output.
 Expects a defined C<output> file in single processing.
 
+=item B<--cache|-c>
+
+File used to mmap the cache (using L<Cache::FastMmap>).
+Defaults to C<korapxml2krill.cache> in the current working directory.
+
+=item B<--cache-size|-cs>
+
+Size of the cache, e.g. C<512k> or C<50m>. Defaults to C<50m>.
+
+=item B<--cache-init|-ci>
+
+Initialize the cache file.
+Can be disabled using C<--no-cache-init>.
+Defaults to C<true>.
+
+=item B<--cache-delete|-cd>
+
+Delete the cache file after processing.
+Can be disabled using C<--no-cache-delete>.
+Defaults to C<true>.
+
 =item B<--sigle|-sg>
 
 Extract the given text sigles.