Fix dealing with no-span layers|Improve error messages for bughunting
diff --git a/Makefile.PL b/Makefile.PL
index db22f55..804bcaf 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -15,6 +15,7 @@
     'Log::Log4perl' => 1.42,
     'JSON::XS'      => 3.01,
     'Set::Scalar'   => 1.26,
+    'IO::Dir::Recursive' => 0,
     'Benchmark'     => 0,
     'Carp'          => 0,
     'strict'        => 0,
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 6153463..d7d3abf 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -113,6 +113,10 @@
   my $day = $dom->at("pubDate[type=day]");
   $day = $day ? $day->text : 0;
 
+  $year = 0  if $year  !~ /^\d+$/;
+  $month = 0 if $month !~ /^\d+$/;
+  $day = 0   if $day   !~ /^\d+$/;
+
   my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
   $date .= length($month) == 1 ? '0' . $month : $month;
   $date .= length($day) == 1 ? '0' . $day : $day;
diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
index be0a234..8566896 100644
--- a/lib/KorAP/Document/Primary.pm
+++ b/lib/KorAP/Document/Primary.pm
@@ -1,7 +1,7 @@
 package KorAP::Document::Primary;
 use strict;
 use warnings;
-use Carp qw/croak/;
+use Carp qw/croak carp/;
 use Mojo::ByteStream 'b';
 use feature 'state';
 use Packed::Array;
@@ -34,7 +34,8 @@
     return $substr;
   };
   # encode 'UTF-8',
-  croak 'Unable to find substring';
+  carp 'Unable to find substring';
+  return;
 };
 
 
@@ -55,7 +56,7 @@
     return b($substr)->decode;
   };
   # encode 'UTF-8',
-  croak 'Unable to find substring';
+  carp 'Unable to find substring';
 };
 
 
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 338a748..6b45f63 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -22,7 +22,7 @@
     }
   ) or return;
 
-  $$self->stream->add_meta('paragraph', '<i>' . $i);
+  $$self->stream->add_meta('paragraphs', '<i>' . $i);
 
   return 1;
 };
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index d6d673e..55ecbbc 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -40,7 +40,7 @@
 	  foreach (split '\|', $found) {
 	    my ($x, $y) = split "=", $_;
 	    # case, tense, number, mood, person, degree, gender
-	    $mtt->add(term => 'mate/m:' . $x . ':' . $y);
+	    $mtt->add(term => 'mate/m:' . $x . ($y ? ':' . $y : ''));
 	  };
 	};
       };
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index 0b774f7..3e60d9d 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -21,7 +21,7 @@
     }
   ) or return;
 
-  $$self->stream->add_meta('sentence', '<i>' . $i);
+  $$self->stream->add_meta('sentences', '<i>' . $i);
 
   return 1;
 };
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 3b29424..b60e724 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -24,7 +24,8 @@
 
   # Create new token stream
   my $mtts = KorAP::Field::MultiTermTokenStream->new;
-  my $file = b($self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml')->slurp;
+  my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
+  my $file = b($path)->slurp;
   my $tokens = Mojo::DOM->new($file);
   $tokens->xml(1);
 
@@ -48,6 +49,11 @@
       my $to = $span->attr('to');
       my $token = $doc->primary->data($from, $to);
 
+      unless ($token) {
+	  $self->log->error("Unable to find substring [$from-$to] in $path");
+	  return;
+      };
+
       $should++;
 
       # Ignore non-word tokens
@@ -81,7 +87,7 @@
     });
 
   # Add token count
-  $mtts->add_meta('token', '<i>' . $have);
+  $mtts->add_meta('tokens', '<i>' . $have);
 
   $range->gap($old, $doc->primary->data_length, $have-1) if $doc->primary->data_length >= $old;
 
diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
index 2a06aea..bae1edf 100644
--- a/lib/KorAP/Tokenizer/Match.pm
+++ b/lib/KorAP/Tokenizer/Match.pm
@@ -13,14 +13,17 @@
 };
 
 sub lookup {
+  return undef unless $_[1] && $_[2];
   $_[0]->{$_[1] . ':' . $_[2]} // undef;
 };
 
 sub startswith {
+  return undef unless $_[1];
   $_[0]->{'[' . $_[1]} // undef;
 };
 
 sub endswith {
+  return undef unless $_[1];
   $_[0]->{$_[1] . ']'} // undef;
 };
 
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 5e5cec6..1b69624 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -4,19 +4,47 @@
 use Mojo::DOM;
 use Mojo::ByteStream 'b';
 use XML::Fast;
+use Try::Tiny;
 
 has 'range';
 
+has 'log' => sub {
+  Log::Log4perl->get_logger(__PACKAGE__)
+};
+
 sub parse {
   my $self = shift;
-  my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+  my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
+  my $file = b($path)->slurp;
 
   # my $spans = Mojo::DOM->new($file);
   # $spans->xml(1);
 
   # my $spans = XML::LibXML->load_xml(string => $file);
 
-  my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
+  my $spans;
+
+  try {
+      local $SIG{__WARN__} = sub {
+	  my $msg = shift;
+	  $self->log->error('Error in ' . $path . ($msg ? ': ' . $msg : ''));
+      };
+
+      $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+
+  }
+  catch  {
+      $self->log->error('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+      return [];
+  };
+
+  if (ref $spans && $spans->{span}) {
+      $spans = $spans->{span};
+  }
+  else {
+      return [];
+  };
+
   $spans = [$spans] if ref $spans ne 'ARRAY';
 
   my ($should, $have) = (0,0);
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index ceb2e4c..3e11d86 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -5,17 +5,42 @@
 use KorAP::Tokenizer::Token;
 use Carp qw/croak carp/;
 use XML::Fast;
+use Try::Tiny;
 
+has 'log' => sub {
+  Log::Log4perl->get_logger(__PACKAGE__)
+};
 
 sub parse {
   my $self = shift;
-  my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+  my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
+  my $file = b($path)->slurp;
 
 #  my $spans = Mojo::DOM->new($file);
 #  $spans->xml(1);
-  my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
-  $spans = [$spans] if ref $spans ne 'ARRAY';
 
+  my $spans;
+  try {
+      local $SIG{__WARN__} = sub {
+	  my $msg = shift;
+	  $self->log->error('Error in ' . $path . ($msg ? ': ' . $msg : ''));
+      };
+
+      $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+  }
+  catch  {
+      $self->log->error('Span error in ' . $path . ($_ ? ': ' . $_ : ''));
+      return [];
+  };
+
+  if (ref $spans && $spans->{span}) {
+      $spans = $spans->{span};
+  }
+  else {
+      return [];
+  };
+
+  $spans = [$spans] if ref $spans ne 'ARRAY';
 
   my ($should, $have) = (0,0);
 
diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
index 74bafa9..5deead0 100644
--- a/lib/KorAP/Tokenizer/Units.pm
+++ b/lib/KorAP/Tokenizer/Units.pm
@@ -3,7 +3,9 @@
 use KorAP::Tokenizer::Token;
 use Mojo::Base -base;
 
-has [qw/path foundry layer match range primary should have/];
+has [qw/path foundry layer match range primary/];
+has 'should' => 0;
+has 'have' => 0;
 has 'encoding' => 'utf-8';
 
 sub span {
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
index 21d8dd9..8acdc80 100644
--- a/script/prepare_index.pl
+++ b/script/prepare_index.pl
@@ -20,7 +20,7 @@
 Merge foundry data based on a tokenization and create indexer friendly documents.
 
 Call:
-prepare_index.pl -z --input <directory> --outputfile <filename>
+prepare_index.pl -z --input <directory> --output <filename>
 
 --input|-i <directory>          Directory of the document to index
 --output|-o <filename>          Document name for output (optional),
@@ -54,7 +54,7 @@
 # Options from the command line
 my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);
 GetOptions(
-  'input|x=s'   => \$input,
+  'input|i=s'   => \$input,
   'output|o=s'  => \$output,
   'human|m'     => \$text,
   'token|t=s'   => \$token_base,
diff --git a/script/wrap_folders.pl b/script/wrap_folders.pl
new file mode 100644
index 0000000..979e1bd
--- /dev/null
+++ b/script/wrap_folders.pl
@@ -0,0 +1,98 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use FindBin;
+use v5.16;
+use Getopt::Long;
+use Directory::Iterator;
+
+my $local = $FindBin::Bin;
+
+sub printhelp {
+  print <<'EOHELP';
+
+Merge foundry data based on a tokenization and create indexer friendly documents
+for whole directories.
+
+Call:
+wrap_folders.pl -z --input <directory> --output <directory>
+
+--input|-i <directory>          Directory of documents to index
+--output|-o <directory>         Name of output folder
+--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
+                                the name of the foundry and optionally the name
+                                of the layer. Defaults to OpenNLP#tokens.
+--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
+                                or specific layers by defining the name
+                                with a # in front of the foundry,
+                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
+                                Can be set multiple times.
+--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
+                                combining the foundry name with a # and the layer name.
+--primary|-p                    Output primary data or not. Defaults to true.
+                                Can be flagged using --no-primary as well.
+--human|-m                      Represent the data human friendly,
+                                while the output defaults to JSON
+--pretty|-y                     Pretty print json output
+--gzip|-z                       Compress the output
+                                (expects a defined output file)
+--log|-l                        The Log4perl log level, defaults to ERROR.
+--help|-h                       Print this document (optional)
+
+diewald@ids-mannheim.de, 2013/11/25
+
+EOHELP
+
+  exit(defined $_[0] ? $_[0] : 0);
+};
+
+my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);
+GetOptions(
+  'input|i=s'   => \$input,
+  'output|o=s'  => \$output,
+  'human|m'     => \$text,
+  'token|t=s'   => \$token_base,
+  'gzip|z'      => \$gzip,
+  'skip|s=s'    => \@skip,
+  'log|l=s'     => \$log_level,
+  'allow|a=s'   => \@allow,
+  'primary|p!'  => \$primary,
+  'pretty|y'    => \$pretty,
+  'help|h'      => sub { printhelp }
+);
+
+printhelp(1) if !$input || !$output;
+
+
+sub write_file {
+  my $anno = shift;
+  my $file = $anno;
+  $file =~ s/^?\/?$input//;
+  $file =~ tr/\//-/;
+  $file =~ s{^-+}{};
+
+  my $call = 'perl ' . $local . '/prepare_index.pl -i ' . $anno . ' -o ' . $output . '/' . $file . '.json';
+  $call .= '.gz -z' if $gzip;
+  $call .= ' -m' if $text;
+  $call .= ' -l ' . $log_level if $log_level;
+  $call .= ' --no-primary ' if $primary;
+  $call .= ' -y ' . $pretty if $pretty;
+  $call .= ' -a ' . $_ foreach @allow;
+  $call .= ' -s ' . $_ foreach @skip;
+  system($call);
+};
+
+
+my $it = Directory::Iterator->new($input);
+my $dir;
+while (1) {
+
+    if (!$it->is_directory && ($dir = $it->get) && $dir =~ s{/data\.xml$}{}) {
+	write_file($dir);
+	$it->prune;
+    };
+  last unless $it->next;
+};
+
+
+__END__