Fixed primary data problems, sped up parsing by using more C code (XML::Fast), and now provide layer info
diff --git a/Makefile.PL b/Makefile.PL
index 804bcaf..7ee0a7a 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -21,7 +21,8 @@
     'strict'        => 0,
     'warnings'      => 0,
     'utf8'          => 0,
-    'bytes'         => 0
+    'bytes'         => 0,
+    'List::MoreUtils' => 0.33
   },
 #  LICENSE      => 'perl',
   MIN_PERL_VERSION => '5.016',
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 5b4ef73..2cb023a 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -1,9 +1,9 @@
 package KorAP::Document;
 use Mojo::Base -base;
 use v5.16;
-
 use Mojo::ByteStream 'b';
-use Mojo::DOM;
+use XML::Fast;
+use Try::Tiny;
 use Carp qw/croak/;
 use KorAP::Document::Primary;
 
@@ -19,17 +19,26 @@
   my $self = shift;
   my $file = b($self->path . 'data.xml')->slurp;
 
+  my ($rt, $error);
   state $unable = 'Unable to parse document ' . $self->path;
+  try {
+      local $SIG{__WARN__} = sub {
+	  $error = 1;
+      };
+      $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
+  }
+  catch  {
+      $self->log->warn($unable);
+      $error = 1;
+  };
+
+  return if $error;
 
   $self->log->debug('Parse document ' . $self->path);
 
-  my $dom = Mojo::DOM->new($file);
-
-  my $rt = $dom->at('raw_text');
-
   # Get document id and corpus id
-  if ($rt && $rt->attr('docid')) {
-    $self->id($rt->attr('docid'));
+  if ($rt && $rt->{'-docid'}) {
+    $self->id($rt->{'-docid'});
     if ($self->id =~ /^([^_]+)_/) {
       $self->corpus_id($1);
     }
@@ -42,11 +51,9 @@
   };
 
   # Get primary data
-  my $pd = $rt->at('text');
+  my $pd = $rt->{text};
   if ($pd) {
-
-    $pd = b($pd->text)->decode;
-    $self->{pd} = KorAP::Document::Primary->new($pd->to_string);
+    $self->{pd} = KorAP::Document::Primary->new($pd);
   }
   else {
     croak $unable;
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 7675737..2064e6e 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -1,6 +1,7 @@
 package KorAP::Field::MultiTermToken;
 use KorAP::Field::MultiTerm;
 use Mojo::Base -base;
+use List::MoreUtils 'uniq';
 
 has [qw/o_start o_end/];
 
@@ -33,7 +34,7 @@
 
 sub to_array {
   my $self = shift;
-  [map($_->to_string, @{$self->{mt}})];
+  [uniq(map($_->to_string, @{$self->{mt}}))];  # Stringify every multi term, then drop duplicate strings via uniq
 };
 
 1;
diff --git a/lib/KorAP/Index/Base.pm b/lib/KorAP/Index/Base.pm
index cc53420..7e447d6 100644
--- a/lib/KorAP/Index/Base.pm
+++ b/lib/KorAP/Index/Base.pm
@@ -24,4 +24,8 @@
   bless \$tokens, $class;
 };
 
+sub layer_info {  # Default layer info: an empty array ref (other index modules in this change define their own)
+    return []
+};
+
 1;
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 6b45f63..595c527 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -1,8 +1,6 @@
 package KorAP::Index::Base::Paragraphs;
 use KorAP::Index::Base;
 
-
-
 sub parse {
   my $self = shift;
   my $i = 0;
@@ -27,4 +25,6 @@
   return 1;
 };
 
+
+
 1;
diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
index be20d33..1aaa991 100644
--- a/lib/KorAP/Index/Connexor/Morpho.pm
+++ b/lib/KorAP/Index/Connexor/Morpho.pm
@@ -58,5 +58,10 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['cnx/l=lemma', 'cnx/p=pos', 'cnx/m=msd'];
+};
+
+
 
 1;
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index e540169..f6dbe99 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -19,7 +19,7 @@
       if ($type) {
 	my $mtt = $stream->pos($span->p_start);
 	$mtt->add(
-	  term => '<>:cnx/const:' . $type,
+	  term => '<>:cnx/c:' . $type,
 	  o_start => $span->o_start,
 	  o_end => $span->o_end,
 	  p_end => $span->p_end
@@ -31,5 +31,9 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['cnx/c=const'];  # 'cnx/c' matches the '<>:cnx/c:' term prefix emitted by parse
+};
+
 
 1;
diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index 430dff5..f840d1e 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm
@@ -52,5 +52,9 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['mate/d=dep']
+};
+
 
 1;
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index 55ecbbc..663224a 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -49,5 +49,8 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['mate/l=lemma', 'mate/p=pos', 'mate/m=msd']
+}
 
 1;
diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index f03b17f..2de5042 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm
@@ -27,4 +27,8 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['opennlp/p=pos'];
+};
+
 1;
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index e4de00e..d83267a 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -44,5 +44,8 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['tt/p=pos', 'tt/l=lemma']
+};
 
 1;
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index 7b70424..341a9da 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -48,7 +48,7 @@
 
     # $type is now NPA, NP, NUM ...
     my %term = (
-      term => '<>:xip/const:' . $type,
+      term => '<>:xip/c:' . $type,
       o_start => $span->o_start,
       o_end => $span->o_end,
       p_end => $span->p_end
@@ -79,5 +79,8 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['xip/c=const']  # 'xip/c' matches the '<>:xip/c:' term prefix emitted by parse
+}
 
 1;
diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index 98e121e..20ab151 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm
@@ -54,5 +54,9 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['xip/d=dep']
+}
+
 
 1;
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
index 83b484e..45e7c9a 100644
--- a/lib/KorAP/Index/XIP/Morpho.pm
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -57,5 +57,9 @@
   return 1;
 };
 
+sub layer_info {  # Returns this module's layer info as an array ref of 'prefix=name' strings
+    ['xip/l=lemma', 'xip/p=pos']
+};
+
 
 1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 3f2c7f2..e5c7b51 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -2,12 +2,15 @@
 use Mojo::Base -base;
 use Mojo::ByteStream 'b';
 use Mojo::Loader;
+use XML::Fast;
+use Try::Tiny;
 use Carp qw/croak/;
 use KorAP::Tokenizer::Range;
 use KorAP::Tokenizer::Match;
 use KorAP::Tokenizer::Spans;
 use KorAP::Tokenizer::Tokens;
 use KorAP::Field::MultiTermTokenStream;
+use List::MoreUtils 'uniq';
 use JSON::XS;
 use Log::Log4perl;
 
@@ -26,8 +29,8 @@
   my $mtts = KorAP::Field::MultiTermTokenStream->new;
   my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
   my $file = b($path)->slurp;
-  my $tokens = Mojo::DOM->new($file);
-  $tokens->xml(1);
+#  my $tokens = Mojo::DOM->new($file);
+#  $tokens->xml(1);
 
   my $doc = $self->doc;
 
@@ -41,25 +44,58 @@
 
   $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
 
+  # TODO: Reuse the following code from Spans.pm and tokens.pm
+  my ($tokens, $error);
+  try {
+      local $SIG{__WARN__} = sub {
+	  $error = 1;
+      };
+      $tokens = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+  }
+  catch  {
+      $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
+      $error = 1;
+  };
+
+  return if $error;
+
+  if (ref $tokens && $tokens->{span}) {
+      $tokens = $tokens->{span};
+  }
+  else {
+      return [];
+  };
+
+  $tokens = [$tokens] if ref $tokens ne 'ARRAY';
+
   # Iterate over all tokens
-  $tokens->find('span')->each(
-    sub {
-      my $span = $_;
-      my $from = $span->attr('from');
-      my $to = $span->attr('to');
+  # $tokens->find('span')->each(
+  #    sub {
+  # my $span = $_;
+  foreach my $span (@$tokens) {
+      my $from = $span->{'-from'};
+      my $to = $span->{'-to'};
       my $token = $doc->primary->data($from, $to);
 
       # warn 'Has ' . $from . '->' . $to . "($old)";
 
       unless (defined $token) {
 	  $self->log->error("Unable to find substring [$from-$to] in $path");
-	  return;
+	  next;
       };
 
       $should++;
 
       # Ignore non-word tokens
-      return if $token !~ /[\w\d]/;
+      next if $token !~ /[\w\d]/;
+
+#      my $limit = 40;
+#      if ($should > $limit) {
+#	  warn $token;
+#      };
+#      if ($should > $limit+20) {
+#	  die;
+#      };
 
       my $mtt = $mtts->add;
 
@@ -86,7 +122,7 @@
       $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
 
       $have++;
-    });
+  };
 
   # Add token count
   $mtts->add_meta('tokens', '<i>' . $have);
@@ -213,7 +249,12 @@
 
   if ($mod->can('new') || eval("require $mod; 1;")) {
     if (my $retval = $mod->new($self)->parse(@_)) {
+
+      # This layer is supported
       $self->support($foundry => $layer, @_);
+
+      # Get layerinfo
+      $self->layer_info($mod->layer_info);
       return $retval;
     };
   }
@@ -270,6 +311,17 @@
   push(@{$self->{support}->{$f}}, [$l, @info]);
 };
 
+sub layer_info {  # Combined setter/getter for the collected layer info entries
+    my $self = shift;
+    $self->{layer_info} //= [];  # Lazily create the backing array ref
+    if ($_[0]) {  # Setter: a true first argument is dereferenced as an array ref
+	push(@{$self->{layer_info}}, @{$_[0]});  # Append its elements; duplicates are kept at this point
+    }
+    else {  # Getter: called without (or with a false) argument
+	return join ' ', uniq @{$self->{layer_info}};  # De-duplicated entries as one space-separated string
+    };
+};
+
 
 sub to_string {
   my $self = shift;
@@ -290,6 +342,10 @@
       $string .= 'support = ' . $foundry . '#' . join(',', @{$_}) . "\n";
     };
   };
+  foreach my $layer_info (split ' ', $self->layer_info) {  # layer_info() returns a space-joined string, not a hash ref
+    $string .= 'layer_info = ' . $layer_info . "\n";  # One line per entry; use the loop variable, not $_
+  };
+
   $string .= "</info>\n";
   $string .= $self->stream->to_string;
   $string .= "</field>";
@@ -308,7 +364,8 @@
     name => $self->name,
     data => $self->stream->to_array,
     tokenization => lc($self->foundry) . '#' . lc($self->layer),
-    foundries => $self->support
+    foundries => $self->support,
+    layerInfo => $self->layer_info
   });
 
   $data{fields} = \@fields;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 045cf76..d5c6dfa 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -1,7 +1,7 @@
 package KorAP::Tokenizer::Spans;
 use Mojo::Base 'KorAP::Tokenizer::Units';
 use KorAP::Tokenizer::Span;
-use Mojo::DOM;
+# use Mojo::DOM;
 use Mojo::ByteStream 'b';
 use XML::Fast;
 use Try::Tiny;
@@ -20,11 +20,6 @@
 
   my $file = b($path)->slurp;
 
-  # my $spans = Mojo::DOM->new($file);
-  # $spans->xml(1);
-
-  # my $spans = XML::LibXML->load_xml(string => $file);
-
   my ($spans, $error);
   try {
       local $SIG{__WARN__} = sub {