Indexation script finished

commit: 7364d1f29b898a41bb2074a34ce671f7c36ca5de [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Tue Nov 05 19:26:35 2013 +0000
committer: Nils Diewald <nils@diewald-online.de> Tue Nov 05 19:26:35 2013 +0000
tree: 94575601cacd010be2d075a2b8a6e1926fdb60f1
parent: 2db9ad0d484214d641313937810225a635a997a4 [diff]
diff --git a/Makefile.PL b/Makefile.PL
index b076284..db22f55 100644
--- a/Makefile.PL
+++ b/Makefile.PL

@@ -13,11 +13,14 @@
     'Mojolicious'   => 4.51,
     'Packed::Array' => 0.01,
     'Log::Log4perl' => 1.42,
-    'Carp'         => 0,
-    'strict'       => 0,
-    'warnings'     => 0,
-    'utf8'         => 0,
-    'bytes'        => 0
+    'JSON::XS'      => 3.01,
+    'Set::Scalar'   => 1.26,
+    'Benchmark'     => 0,
+    'Carp'          => 0,
+    'strict'        => 0,
+    'warnings'      => 0,
+    'utf8'          => 0,
+    'bytes'         => 0
   },
 #  LICENSE      => 'perl',
   MIN_PERL_VERSION => '5.016',

diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 8f80e6c..4a07f8e 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm

@@ -4,11 +4,15 @@
 
 use Mojo::ByteStream 'b';
 use Mojo::DOM;
-use Carp qw/croak carp/;
+use Carp qw/croak/;
 use KorAP::Document::Primary;
 
-has [qw/id corpus_id path/];
-has [qw/pub_date title sub_title pub_place/];
+our @ATTR = qw/id corpus_id pub_date
+	       title sub_title pub_place/;
+has 'path';
+has [@ATTR];
+
+has log => sub { Log::Log4perl->get_logger(__PACKAGE__) };
 
 # parse document
 sub parse {
@@ -17,7 +21,7 @@
 
   state $unable = 'Unable to parse document';
 
-  carp 'Parse document ' . $self->path;
+  $self->log->trace('Parse document ' . $self->path);
 
   my $dom = Mojo::DOM->new($file);
 
@@ -84,7 +88,7 @@
 sub _parse_meta {
   my $self = shift;
 
-  my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1')->encode;
+  my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
 
   my $dom = Mojo::DOM->new($file);
   my $monogr = $dom->at('monogr');
@@ -126,6 +130,55 @@
   $self->text_class(@topic);
 };
 
+sub to_string {
+  my $self = shift;
+
+  my $string;
+
+  foreach (@ATTR) {
+    if (my $att = $self->$_) {
+      $att =~ s/\n/ /g;
+      $att =~ s/\s\s+/ /g;
+      $string .= $_ . ' = ' . $att . "\n";
+    };
+  };
+
+  if ($self->author) {
+    foreach (@{$self->author}) {
+      $_ =~ s/\n/ /g;
+      $_ =~ s/\s\s+/ /g;
+      $string .= 'author = ' . $_ . "\n";
+    };
+  };
+
+  if ($self->text_class) {
+    foreach (@{$self->text_class}) {
+      $string .= 'text_class = ' . $_ . "\n";
+    };
+  };
+
+  return $string;
+};
+
+
+sub to_hash {
+  my $self = shift;
+
+  my %hash;
+
+  foreach (@ATTR, 'author', 'text_class') {
+    if (my $att = $self->$_) {
+      $att =~ s/\n/ /g;
+      $att =~ s/\s\s+/ /g;
+      $hash{$_} = $att;
+    };
+  };
+
+  return \%hash;
+};
+
+
+
 
 1;
 

diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
index 52ca844..be0a234 100644
--- a/lib/KorAP/Document/Primary.pm
+++ b/lib/KorAP/Document/Primary.pm

@@ -5,7 +5,10 @@
 use Mojo::ByteStream 'b';
 use feature 'state';
 use Packed::Array;
+use utf8;
 
+# our $QUOT = b("„“”")->decode;
+our $QUOT_RE = qr/[„“”]/;
 
 # Constructor
 sub new {
@@ -19,13 +22,37 @@
   my $self = shift;
   my ($from, $to) = @_;
 
-  return b(substr($self->[0], $from))->encode if $from && !$to;
+#  return b(substr($self->[0], $from))->encode if $from && !$to;
+  return substr($self->[0], $from) if $from && !$to;
 
-  return b($self->[0])->encode unless $to;
+#  return b($self->[0])->encode unless $to;
+  return $self->[0] unless $to;
 
   my $substr = substr($self->[0], $from, $to - $from);
   if ($substr) {
-    return b($substr)->encode;
+#    return b($substr)->encode;
+    return $substr;
+  };
+  # encode 'UTF-8',
+  croak 'Unable to find substring';
+};
+
+
+sub data_bytes {
+  my $self = shift;
+  my ($from, $to) = @_;
+
+  use bytes;
+
+  return b(substr($self->[0], $from))->decode if $from && !$to;
+
+#  return b($self->[0])->encode unless $to;
+  return b($self->[0])->decode unless $to;
+
+  my $substr = substr($self->[0], $from, $to - $from);
+  if ($substr) {
+#    return b($substr)->encode;
+    return b($substr)->decode;
   };
   # encode 'UTF-8',
   croak 'Unable to find substring';
@@ -45,17 +72,31 @@
 sub bytes2chars {
   my $self = shift;
   unless ($self->[2]) {
-    $self->_calc_chars;
+    $self->[2] = $self->_calc_chars($self->[0]);
   };
   return $self->[2]->[shift];
 };
 
+# Get correct offset
+sub xip2chars {
+  my $self = shift;
+  unless ($self->[3]) {
+    my $buffer = $self->[0];
+
+    # Hacky work around: replace fancy quotation marks for XIP
+    $buffer =~ s{$QUOT_RE}{"}g;
+
+    $self->[3] = $self->_calc_chars($buffer);
+  };
+  return $self->[3]->[shift];
+};
 
 # Calculate character offsets
 sub _calc_chars {
   use bytes;
-
   my $self = shift;
+  my $text = shift;
+
   tie my @array, 'Packed::Array';
 
   state $leading = pack( 'B8', '10000000' );
@@ -65,14 +106,14 @@
   my $c;
 
   # Init array
-  my $l = length($self->[0]);
+  my $l = length($text);
   $array[$l-1] = 0;
 
   # Iterate over every character
-  while ($i < $l) {
+  while ($i <= $l) {
 
     # Get actual character
-    $c = substr($self->[0], $i, 1);
+    $c = substr($text, $i, 1);
 
     # store character position
     $array[$i++] = $j;
@@ -81,7 +122,7 @@
     if (ord($c & $leading) && ord($c & $start)) {
 
       # Get the next byte - expecting a following character
-      $c = substr($self->[0], $i, 1);
+      $c = substr($text, $i, 1);
 
       # Character is part of a multibyte
       while (ord($c & $leading)) {
@@ -90,14 +131,13 @@
 	$array[$i] = (ord($c & $start)) ? ++$j : $j;
 
 	# Get next character
-	$c = substr($self->[0], ++$i, 1);
+	$c = substr($text, ++$i, 1);
       };
     };
 
     $j++;
   };
-
-  $self->[2] = \@array;
+  return \@array;
 };
 
 

diff --git a/lib/KorAP/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
similarity index 95%
rename from lib/KorAP/MultiTerm.pm
rename to lib/KorAP/Field/MultiTerm.pm
index 9dd12a9..7381a0d 100644
--- a/lib/KorAP/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm

@@ -1,4 +1,4 @@
-package KorAP::MultiTerm;
+package KorAP::Field::MultiTerm;
 use Mojo::Base -base;
 
 has [qw/p_start p_end o_start o_end term payload/];

diff --git a/lib/KorAP/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
similarity index 65%
rename from lib/KorAP/MultiTermToken.pm
rename to lib/KorAP/Field/MultiTermToken.pm
index 9df9811..7675737 100644
--- a/lib/KorAP/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm

@@ -1,5 +1,5 @@
-package KorAP::MultiTermToken;
-use KorAP::MultiTerm;
+package KorAP::Field::MultiTermToken;
+use KorAP::Field::MultiTerm;
 use Mojo::Base -base;
 
 has [qw/o_start o_end/];
@@ -9,10 +9,10 @@
   my $mt;
   unless (ref $_[0] eq 'MultiTerm') {
     if (@_ == 1) {
-      $mt = KorAP::MultiTerm->new(term => shift());
+      $mt = KorAP::Field::MultiTerm->new(term => shift());
     }
     else {
-      $mt = KorAP::MultiTerm->new(@_);
+      $mt = KorAP::Field::MultiTerm->new(@_);
     };
   }
   else {
@@ -31,4 +31,9 @@
   return $string;
 };
 
+sub to_array {
+  my $self = shift;
+  [map($_->to_string, @{$self->{mt}})];
+};
+
 1;

diff --git a/lib/KorAP/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
similarity index 68%
rename from lib/KorAP/MultiTermTokenStream.pm
rename to lib/KorAP/Field/MultiTermTokenStream.pm
index 6d7dc29..d2e66a8 100644
--- a/lib/KorAP/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm

@@ -1,12 +1,12 @@
-package KorAP::MultiTermTokenStream;
+package KorAP::Field::MultiTermTokenStream;
 use Mojo::Base -base;
-use KorAP::MultiTermToken;
+use KorAP::Field::MultiTermToken;
 
 has [qw/oStart oEnd/];
 
 sub add {
   my $self = shift;
-  my $mtt = shift // KorAP::MultiTermToken->new;
+  my $mtt = shift // KorAP::Field::MultiTermToken->new;
   $self->{mtt} //= [];
   push(@{$self->{mtt}}, $mtt);
   return $mtt;
@@ -30,4 +30,9 @@
   return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
 };
 
+sub to_array {
+  my $self = shift;
+  [ map { $_->to_array } @{$self->{mtt}} ];
+};
+
 1;

diff --git a/lib/KorAP/Index/Base.pm b/lib/KorAP/Index/Base.pm
new file mode 100644
index 0000000..cc53420
--- /dev/null
+++ b/lib/KorAP/Index/Base.pm

@@ -0,0 +1,27 @@
+package KorAP::Index::Base;
+
+use strict;
+use warnings;
+
+sub import {
+  my $class = shift;
+  my $caller = caller;
+
+  no strict 'refs';
+
+  push @{"${caller}::ISA"}, $class;
+
+  strict->import;
+  warnings->import;
+  utf8->import;
+  feature->import(':5.10');
+};
+
+
+sub new {
+  my $class = shift;
+  my $tokens = shift;
+  bless \$tokens, $class;
+};
+
+1;

diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
new file mode 100644
index 0000000..35199d4
--- /dev/null
+++ b/lib/KorAP/Index/Base/Paragraphs.pm

@@ -0,0 +1,30 @@
+package KorAP::Index::Base::Paragraphs;
+use KorAP::Index::Base;
+
+
+
+sub parse {
+  my $self = shift;
+  my $i = 0;
+  $$self->add_spandata(
+    foundry => 'base',
+    layer => 'paragraph',
+    cb => sub {
+      my ($stream, $span) = @_;
+      my $mtt = $stream->pos($span->p_start);
+      $mtt->add(
+	term => '<>:p',
+	o_start => $span->o_start,
+	o_end => $span->o_end,
+	p_end => $span->p_end
+      );
+      $i++;
+    }
+  ) or return;
+
+  $$self->stream->add_meta('p', '<i>' . $i);
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
new file mode 100644
index 0000000..74533bf
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Morpho.pm

@@ -0,0 +1,62 @@
+package KorAP::Index::Connexor::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'connexor',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f};
+
+      my $found;
+
+      my $features = $content->{fs}->{f};
+
+      for my $f (@$features) {
+
+      # Lemma
+	if (($f->{-name} eq 'lemma') && ($found = $f->{'#text'})) {
+	  if (index($found, "\N{U+00a0}") >= 0) {
+	    foreach (split(/\x{00A0}/, $found)) {
+	      $mtt->add(
+		term => 'cnx_l:' . $_
+	      );
+	    }
+	  }
+	  else {
+	    $mtt->add(
+	      term => 'cnx_l:' . $found
+	    );
+	  };
+	}
+
+	# POS
+	elsif (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
+	  $mtt->add(
+	    term => 'cnx_p:' . $found
+	  );
+
+	}
+	# MSD
+	# Todo: Look in the description!
+	elsif (($f->{-name} eq 'msd') && ($found = $f->{'#text'})) {
+	  foreach (split(':', $found)) {
+	    $mtt->add(
+	      term => 'cnx_m:' . $_
+	    );
+	  };
+	};
+      };
+    }
+  ) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
new file mode 100644
index 0000000..60fcc8e
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Phrase.pm

@@ -0,0 +1,35 @@
+package KorAP::Index::Connexor::Phrase;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_spandata(
+    foundry => 'connexor',
+    layer => 'phrase',
+    cb => sub {
+      my ($stream, $span) = @_;
+
+      my $content = $span->hash->{fs}->{f};
+
+      return if $content->{-name} ne 'pos';
+
+      my $type = $content->{'#text'};
+
+      if ($type) {
+	my $mtt = $stream->pos($span->p_start);
+	$mtt->add(
+	  term => '<>:cnx_const:' . $type,
+	  o_start => $span->o_start,
+	  o_end => $span->o_end,
+	  p_end => $span->p_end
+	);
+      };
+    }
+  ) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/Connexor/Syntax.pm b/lib/KorAP/Index/Connexor/Syntax.pm
new file mode 100644
index 0000000..35f5079
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Syntax.pm

@@ -0,0 +1,31 @@
+package KorAP::Index::Connexor::Syntax;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'connexor',
+    layer => 'syntax',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $found;
+      my $spans = $token->hash->{fs}->{f}->{fs}->{f};
+
+
+      # syntax
+      foreach (@$spans) {
+	if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
+	  $mtt->add(
+	    term => 'cnx_syn:' . $found
+	  );
+	};
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/CoreNLP/NamedEntities.pm b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
new file mode 100644
index 0000000..8596783
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/NamedEntities.pm

@@ -0,0 +1,32 @@
+package KorAP::Index::CoreNLP::NamedEntities;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+  my $model = shift;
+
+  $$self->add_tokendata(
+    foundry => 'corenlp',
+    layer => $model,
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f} or return;
+      my $found;
+
+      if (($content->{-name} eq 'ne') &&
+	    ($found = $content->{fs}) &&
+	      ($found = $found->{f}) &&
+		($found->{-name} eq 'ent') &&
+		  ($found = $found->{'#text'})) {
+	$mtt->add(
+	  term => 'corenlp_' . $model . ':' . $found
+	);
+      };
+    }) or return;
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
new file mode 100644
index 0000000..d300363
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Dependency.pm

@@ -0,0 +1,56 @@
+package KorAP::Index::Mate::Dependency;
+use KorAP::Index::Base;
+use Data::Dumper;
+
+sub parse {
+  my $self = shift;
+
+  # TODO: Create XIP tree here - for indirect dependency
+  # >>:xip_d:SUBJ<i>566<i>789
+
+  $$self->add_tokendata(
+    foundry => 'mate',
+    layer => 'dependency',
+    cb => sub {
+      my ($stream, $token, $tokens) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash;
+
+      my $rel = $content->{rel};
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach (@$rel) {
+	my $label = $_->{-label};
+
+	if ($_->{-type} && $_->{-type} eq 'unary') {
+	  next if $_->{-label} eq '--';
+	  $mtt->add(
+	    term => 'mate_d:' . $label
+	  );
+	}
+	else {
+
+	  my $from = $_->{span}->{-from};
+	  my $to   = $_->{span}->{-to};
+
+	  my $rel_token = $tokens->token($from, $to) or next;
+
+	  $mtt->add(
+	    term => '>:mate_d:' . $label,
+	    payload => '<i>' . $rel_token->pos
+	  );
+
+	  $stream->pos($rel_token->pos)->add(
+	    term => '<:mate_d:' . $label,
+	    payload => '<i>' . $token->pos
+	  );
+	};
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
new file mode 100644
index 0000000..b1a6803
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Morpho.pm

@@ -0,0 +1,54 @@
+package KorAP::Index::Mate::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'mate',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f};
+
+      my $found;
+
+      my $capital = 0;
+
+      foreach my $f (@{$content->{fs}->{f}}) {
+
+	# pos
+	if (($f->{-name} eq 'pos') &&
+	      ($found = $f->{'#text'})) {
+	  $mtt->add(term => 'mate_p:' . $found
+		  );
+	}
+
+	# lemma
+	elsif (($f->{-name} eq 'lemma')
+		 && ($found = $f->{'#text'})
+		   && $found ne '--') {
+	  # b($found)->decode('latin-1')->encode->to_string
+	  $mtt->add(term => 'mate_l:' . $found);
+	}
+
+	# MSD
+	elsif (($f->{-name} eq 'msd') &&
+		 ($found = $f->{'#text'}) &&
+		   ($found ne '_')) {
+	  foreach (split '\|', $found) {
+	    my ($x, $y) = split "=", $_;
+	    # case, tense, number, mood, person, degree, gender
+	    $mtt->add(term => 'mate_m:' . $x . ':' . $y);
+	  };
+	};
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
new file mode 100644
index 0000000..20716b0
--- /dev/null
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm

@@ -0,0 +1,30 @@
+package KorAP::Index::OpenNLP::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'opennlp',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f} or return;
+
+      $content = $content->{fs}->{f};
+      my $found;
+
+      # syntax
+      if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
+	$mtt->add(
+	  term => 'opennlp_p:' . $content->{'#text'}
+	);
+      };
+    }) or return;
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
new file mode 100644
index 0000000..7868fb2
--- /dev/null
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm

@@ -0,0 +1,29 @@
+package KorAP::Index::OpenNLP::Sentences;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+  my $i = 0;
+
+  $$self->add_spandata(
+    foundry => 'opennlp',
+    layer => 'sentences',
+    cb => sub {
+      my ($stream, $span) = @_;
+      my $mtt = $stream->pos($span->p_start);
+      $mtt->add(
+	term => '<>:s',
+	o_start => $span->o_start,
+	o_end => $span->o_end,
+	p_end => $span->p_end
+      );
+      $i++;
+    }
+  ) or return;
+
+  $$self->stream->add_meta('s', '<i>' . $i);
+
+  return 1;
+};
+
+1;

diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
new file mode 100644
index 0000000..67994ac
--- /dev/null
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm

@@ -0,0 +1,48 @@
+package KorAP::Index::TreeTagger::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'tree_tagger',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f};
+
+      my $found;
+
+      $content = ref $content ne 'ARRAY' ? [$content] : $content;
+
+      foreach my $fs (@$content) {
+	$content = $fs->{fs}->{f};
+	foreach (@$content) {
+
+	  # lemma
+	  if (($_->{-name} eq 'lemma') &&
+		($found = $_->{'#text'}) &&
+		  ($found ne 'UNKNOWN') &&
+		    ($found ne '?')) {
+	    $mtt->add(
+	      term => 'tt_l:' . $found
+	    );
+	  };
+
+	  # pos
+	  if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
+	    $mtt->add(
+	      term => 'tt_p:' . $found
+	    );
+	  };
+	};
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
new file mode 100644
index 0000000..61c2a5e
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Constituency.pm

@@ -0,0 +1,83 @@
+package KorAP::Index::XIP::Constituency;
+use KorAP::Index::Base;
+use Set::Scalar;
+use v5.16;
+
+sub parse {
+  my $self = shift;
+
+  # Collect all spans and check for roots
+  my %xip_const;
+  my $xip_const_root = Set::Scalar->new;
+  my $xip_const_noroot = Set::Scalar->new;
+
+  # First run:
+  $$self->add_spandata(
+    foundry => 'xip',
+    layer => 'constituency',
+    encoding => 'xip',
+    cb => sub {
+      my ($stream, $span) = @_;
+
+      $xip_const{$span->id} = $span;
+      $xip_const_root->insert($span->id);
+
+      my $rel = $span->hash->{rel} or return;
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach (@$rel) {
+	if ($_->{-label} eq 'dominates' && $_->{-target}) {
+	  $xip_const_noroot->insert($_->{-target});
+	};
+      };
+    }
+  ) or return;
+
+  my $stream = $$self->stream;
+
+  my $add_const = sub {
+    my $span = shift;
+    my $level = shift;
+    my $mtt = $stream->pos($span->p_start);
+
+    my $content = $span->hash;
+    my $f = $content->{fs}->{f};
+    return unless $f->{-name} eq 'const';
+
+    my $type = $f->{'#text'} or return;
+
+    # $type is now NPA, NP, NUM ...
+    my %term = (
+      term => '<>:xip_const:' . $type,
+      o_start => $span->o_start,
+      o_end => $span->o_end,
+      p_end => $span->p_end
+    );
+
+    $term{payload} = '<s>' . $level if $level;
+
+    $mtt->add(%term);
+
+    my $this = __SUB__;
+
+    my $rel = $content->{rel} or return;
+    $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+    foreach (@$rel) {
+      next if $_->{-label} ne 'dominates' || !$_->{-target};
+      my $subspan = delete $xip_const{$_->{-target}} or return;
+      $this->($subspan, $level + 1);
+    };
+  };
+
+  my $diff = $xip_const_root->difference($xip_const_noroot);
+  foreach ($diff->members) {
+    my $obj = delete $xip_const{$_} or next;
+    $add_const->($obj, 0);
+  };
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
new file mode 100644
index 0000000..ca889d8
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Dependency.pm

@@ -0,0 +1,58 @@
+package KorAP::Index::XIP::Dependency;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  # TODO: Create XIP tree here - for indirect dependency
+  # >>:xip_d:SUBJ<i>566<i>789
+
+  $$self->add_tokendata(
+    foundry => 'xip',
+    layer => 'dependency',
+    encoding => 'xip',
+    cb => sub {
+      my ($stream, $token, $tokens) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash;
+
+      my $rel = $content->{rel};
+      $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+      foreach (@$rel) {
+	my $label = $_->{-label};
+
+	if ($_->{-type} && $_->{-type} eq 'unary') {
+	  $mtt->add(
+	    term => 'xip_d:' . $label
+	  );
+	}
+	else {
+
+	  my $from = $_->{span}->{-from};
+	  my $to   = $_->{span}->{-to};
+
+	  my $rel_token = $tokens->token($from, $to) or next;
+
+	  # die $token->pos . ' -' . $label . '-> ' . $rel_token->pos;
+	  $mtt->add(
+	    term => '>:xip_d:' . $label,
+	    payload => '<i>' . $rel_token->pos
+	  );
+
+	  $stream->pos($rel_token->pos)->add(
+	    term => '<:xip_d:' . $label,
+	    payload => '<i>' . $token->pos
+	  );
+	};
+
+#	print $label,"\n";
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
new file mode 100644
index 0000000..d568afe
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Morpho.pm

@@ -0,0 +1,61 @@
+package KorAP::Index::XIP::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'xip',
+    layer => 'morpho',
+    encoding => 'xip',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f}->{fs}->{f};
+
+      my $found;
+
+      my $capital = 0;
+      foreach (@$content) {
+	# pos
+	if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
+	  $mtt->add(
+	    term => 'xip_p:' . $found
+	  );
+
+	  $capital = 1 if $found eq 'NOUN';
+	}
+      };
+
+      foreach (@$content) {
+	# lemma
+	if (($_->{-name} eq 'lemma') && ($found = $_->{'#text'})) {
+
+	  # Verb delimiter (aus=druecken)
+	  $found =~ tr/=//d;
+
+	  # Composites
+	  my (@token) = split('#', $found);
+
+	  my $full = '';
+	  foreach (@token) {
+	    $full .= $_;
+	    $_ =~ s{/\w+$}{};
+	    $mtt->add(term => 'xip_l:' . $_);
+	  };
+	  if (@token > 1) {
+	    $full =~ s{/}{}g;
+	    $full = lc $full;
+	    $full = $capital ? ucfirst($full) : $full;
+	    $mtt->add(term => 'xip_l:' . $full);
+	  };
+	};
+      };
+    }) or return;
+
+  return 1;
+};
+
+
+1;

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 7038cc4..4920215 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -1,16 +1,18 @@
 package KorAP::Tokenizer;
-
 use Mojo::Base -base;
 use Mojo::ByteStream 'b';
-use Carp qw/carp croak/;
+use Mojo::Loader;
+use Carp qw/croak/;
 use KorAP::Tokenizer::Range;
 use KorAP::Tokenizer::Match;
 use KorAP::Tokenizer::Spans;
 use KorAP::Tokenizer::Tokens;
-use KorAP::MultiTermTokenStream;
+use KorAP::Field::MultiTermTokenStream;
+use JSON::XS;
 use Log::Log4perl;
 
-has [qw/path foundry layer doc stream should have/];
+has [qw/path foundry doc stream should have name/];
+has layer => 'Tokens';
 
 has 'log' => sub {
   Log::Log4perl->get_logger(__PACKAGE__)
@@ -21,8 +23,8 @@
   my $self = shift;
 
   # Create new token stream
-  my $mtts = KorAP::MultiTermTokenStream->new;
-  my $file = b($self->path . $self->foundry . '/' . ($self->layer // 'tokens') . '.xml')->slurp;
+  my $mtts = KorAP::Field::MultiTermTokenStream->new;
+  my $file = b($self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml')->slurp;
   my $tokens = Mojo::DOM->new($file);
   $tokens->xml(1);
 
@@ -121,13 +123,12 @@
 
   my $cb = delete $param{cb};
 
-  if ($param{encoding} && $param{encoding} eq 'bytes') {
-    $param{primary} = $self->doc->primary;
-  };
+  $param{primary} = $self->doc->primary;
 
   my $spans = KorAP::Tokenizer::Spans->new(
     path => $self->path,
     range => $self->range,
+    match => $self->match,
     %param
   );
 
@@ -140,12 +141,11 @@
     $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
   };
 
-
   if ($cb) {
     foreach (@$spanarray) {
-      $cb->($self->stream, $_);
+      $cb->($self->stream, $_, $spans);
     };
-    return;
+    return 1;
   };
   return $spans;
 };
@@ -165,12 +165,11 @@
 
   my $cb = delete $param{cb};
 
-  if ($param{encoding} && $param{encoding} eq 'bytes') {
-    $param{primary} = $self->doc->primary;
-  };
+  $param{primary} = $self->doc->primary;
 
   my $tokens = KorAP::Tokenizer::Tokens->new(
     path => $self->path,
+    range => $self->range,
     match => $self->match,
     %param
   );
@@ -189,14 +188,35 @@
 
   if ($cb) {
     foreach (@$tokenarray) {
-      $cb->($self->stream, $_);
+      $cb->($self->stream, $_, $tokens);
     };
-    return;
+    return 1;
   };
   return $tokens;
 };
 
 
+sub add {
+  my $self = shift;
+  my $loader = Mojo::Loader->new;
+  my $foundry = shift;
+  my $layer = shift;
+  my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;
+
+  if ($mod->can('new') || eval("require $mod; 1;")) {
+    if (my $retval = $mod->new($self)->parse(@_)) {
+      $self->support($foundry => $layer, @_);
+      return $retval;
+    };
+  }
+  else {
+    $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
+  };
+
+  return;
+};
+
+
 sub _perc {
   if (@_ == 2) {
     # '[' . $_[0] . '/' . $_[1] . ']' .
@@ -215,6 +235,73 @@
 };
 
 
+sub support {
+  my $self = shift;
+  unless ($_[0]) {
+    return $self->{support} // {};
+  }
+  elsif (!$_[1]) {
+    return $self->{support}->{$_[0]} // []
+  };
+  my $f = lc shift;
+  my $l = lc shift;
+  my @info = @_;
+  $self->{support} //= {};
+  $self->{support}->{$f} //= [];
+  push(@{$self->{support}->{$f}}, [$l, @info]);
+};
+
+
+sub to_string {
+  my $self = shift;
+  my $primary = defined $_[0] ? $_[0] : 1;
+  my $string = "<meta>\n";
+  $string .= $self->doc->to_string;
+  $string .= "</meta>\n";
+  if ($primary) {
+    $string .= "<text>\n";
+    $string .= $self->doc->primary->data . "\n";
+    $string .= "</text>\n";
+  };
+  $string .= '<field name="' . $self->name . "\">\n";
+  $string .= "<info>\n";
+  $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";
+  foreach my $foundry (keys %{$self->support}) {
+    foreach (@{$self->support($foundry)}) {
+      $string .= 'support = ' . $foundry . '#' . join(',', @{$_}) . "\n";
+    };
+  };
+  $string .= "</info>\n";
+  $string .= $self->stream->to_string;
+  $string .= "</field>";
+  return $string;
+};
+
+sub to_data {
+  my $self = shift;
+  my $primary = defined $_[0] ? $_[0] : 1;
+  my %data;
+  $data{meta} = $self->doc->to_hash;
+  $data{primary} = $self->doc->primary->data if $primary;
+  $data{fields} = [ {
+    name => $self->name,
+    data => $self->stream->to_array,
+    tokenization => [lc($self->foundry), lc($self->layer)],
+    support => $self->support
+  }];
+  \%data;
+};
+
+sub to_json {
+  encode_json($_[0]->to_data($_[1]));
+};
+
+
+sub to_pretty_json {
+  JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
+};
+
+
 1;
 
 
@@ -276,7 +363,7 @@
 
   $tokens->stream->add_meta('adjCount', '<i>45');
 
-The L<KorAP::MultiTermTokenStream> object
+The L<KorAP::Field::MultiTermTokenStream> object
 
 
 =head2 range
@@ -321,7 +408,7 @@
 Add span information to the parsed token stream.
 Expects a C<foundry> name, a C<layer> name and a
 callback parameter, that will be called after each parsed
-span. The L<KorAP::MultiTermTokenStream> object will be passed,
+span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
 as well as the current L<KorAP::Tokenizer::Span>.
 
 An optional parameter C<encoding> may indicate that the span offsets
@@ -351,7 +438,7 @@
 Add token information to the parsed token stream.
 Expects a C<foundry> name, a C<layer> name and a
 callback parameter, that will be called after each parsed
-token. The L<KorAP::MultiTermTokenStream> object will be passed,
+token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
 as well as the current L<KorAP::Tokenizer::Span>.
 
 An optional parameter C<encoding> may indicate that the token offsets

diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
index e5a96f8..2a06aea 100644
--- a/lib/KorAP/Tokenizer/Match.pm
+++ b/lib/KorAP/Tokenizer/Match.pm

@@ -8,10 +8,21 @@
 
 sub set {
   $_[0]->{$_[1] . ':' . $_[2]} = $_[3];
+  $_[0]->{'[' . $_[1]} = $_[3];
+  $_[0]->{$_[2] . ']'} = $_[3];
 };
 
 sub lookup {
   $_[0]->{$_[1] . ':' . $_[2]} // undef;
 };
 
+sub startswith {
+  $_[0]->{'[' . $_[1]} // undef;
+};
+
+sub endswith {
+  $_[0]->{$_[1] . ']'} // undef;
+};
+
+
 1;

diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
index c18136b..110fbc6 100644
--- a/lib/KorAP/Tokenizer/Range.pm
+++ b/lib/KorAP/Tokenizer/Range.pm

@@ -35,9 +35,9 @@
 
   if ($found =~ /!(\d+):(\d+)$/) {
     return $1 >= 0 ? $1 : 0;
-  }
-  else {
-    return $found;
+#  }
+#  else {
+#    return $found;
   };
 };
 
@@ -46,9 +46,9 @@
   my $found = $$self->lookup( shift() );
   if ($found =~ /!(\d+):(\d+)$/) {
     return $2;
-  }
-  else {
-    return $found;
+#  }
+#  else {
+#    return $found;
   };
 };
 

diff --git a/lib/KorAP/Tokenizer/Span.pm b/lib/KorAP/Tokenizer/Span.pm
index 0010d7c..485e700 100644
--- a/lib/KorAP/Tokenizer/Span.pm
+++ b/lib/KorAP/Tokenizer/Span.pm

@@ -42,29 +42,33 @@
   $_[0]->[4];
 };
 
-
 sub content {
   if (defined $_[1]) {
     $_[0]->[5] = $_[1];
   }
   else {
-    if ($_[0]->processed) {
-      return $_[0]->[5];
-    }
-    else {
-      my $c = Mojo::DOM->new($_[0]->[5]);
-      $c->xml(1);
-      $_[0]->processed(1);
-      return $_[0]->[5] = $c;
-    };
+    return $_[0]->[5];
   };
 };
 
-sub processed {
-  if (defined $_[1]) {
-    $_[0]->[6] = $_[1] ? 1 : 0;
+sub dom {
+  if ($_[0]->[6]) {
+    return $_[0]->[6];
+  }
+  else {
+    my $c = Mojo::DOM->new($_[0]->[5]);
+    $c->xml(1);
+    return $_[0]->[6] = $c;
   };
-  $_[0]->[6];
+};
+
+sub hash {
+  if (defined $_[1]) {
+    $_[0]->[7] = $_[1];
+  }
+  else {
+    return $_[0]->[7];
+  };
 };
 
 1;

diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 7e1a382..7eb31c0 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm

@@ -1,58 +1,43 @@
 package KorAP::Tokenizer::Spans;
-use Mojo::Base -base;
+use Mojo::Base 'KorAP::Tokenizer::Units';
 use KorAP::Tokenizer::Span;
 use Mojo::DOM;
 use Mojo::ByteStream 'b';
+use XML::Fast;
 
-has [qw/path foundry layer range primary should have/];
-has 'encoding' => 'utf-8';
+has 'range';
 
 sub parse {
   my $self = shift;
   my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
 
-  my $spans = Mojo::DOM->new($file);
-  $spans->xml(1);
+  # my $spans = Mojo::DOM->new($file);
+  # $spans->xml(1);
+
+  # my $spans = XML::LibXML->load_xml(string => $file);
+
+  my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
 
   my ($should, $have) = (0,0);
-  my ($from, $to);
+  my ($from, $to, $h);
 
   my @spans;
-  $spans->find('span')->each(
-    sub {
-      my $s = shift;
+  my $p = $self->primary;
 
-      $should++;
+  foreach my $s (@$spans) {
 
-      if ($self->encoding eq 'bytes') {
-	$from = $self->primary->bytes2chars($s->attr('from'));
-	$to = $self->primary->bytes2chars($s->attr('to'));
-      }
-      else {
-	$from = $s->attr('from');
-	$to = $s->attr('to');
-      };
+    $should++;
 
-      return unless $to > $from;
+    my $span = $self->span(
+      $s->{-from},
+      $s->{-to},
+      $s
+    ) or next;
 
-      my $span = KorAP::Tokenizer::Span->new;
+    $have++;
 
-      $span->id($s->attr('id'));
-      $span->o_start($from);
-      $span->o_end($to);
-      $span->p_start($self->range->after($span->o_start));
-      $span->p_end($self->range->before($span->o_end));
-
-      return unless $span->p_end >= $span->p_start;
-
-      if (@{$s->children}) {
-	$span->content($s->content_xml);
-      };
-
-      $have++;
-
-      push(@spans, $span);
-    });
+    push(@spans, $span);
+  };
 
   $self->should($should);
   $self->have($have);

diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
index f6c1971..fe12a25 100644
--- a/lib/KorAP/Tokenizer/Token.pm
+++ b/lib/KorAP/Tokenizer/Token.pm

@@ -14,14 +14,13 @@
   $_[0]->[0];
 };
 
+
 sub content {
-  if ($_[1]) {
+  if (defined $_[1]) {
     $_[0]->[1] = $_[1];
   }
   else {
-    my $c = Mojo::DOM->new($_[0]->[1]);
-    $c->xml(1);
-    return $c;
+    return $_[0]->[1];
   };
 };
 
@@ -34,4 +33,25 @@
   };
 };
 
+sub dom {
+  if ($_[0]->[3]) {
+    return $_[0]->[3];
+  }
+  else {
+    my $c = Mojo::DOM->new($_[0]->[1]);
+    $c->xml(1);
+    return $_[0]->[3] = $c;
+  };
+};
+
+sub hash {
+  if (defined $_[1]) {
+    $_[0]->[4] = $_[1];
+  }
+  else {
+    return $_[0]->[4];
+  };
+};
+
+
 1;

diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index 7657460..58ff6fa 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm

@@ -1,56 +1,38 @@
 package KorAP::Tokenizer::Tokens;
-use Mojo::Base -base;
+use Mojo::Base 'KorAP::Tokenizer::Units';
 use Mojo::DOM;
 use Mojo::ByteStream 'b';
 use KorAP::Tokenizer::Token;
+use Carp qw/croak carp/;
+use XML::Fast;
 
-has [qw/path foundry layer match primary should have/];
-has 'encoding' => 'utf-8';
 
 sub parse {
   my $self = shift;
   my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
 
-  my $spans = Mojo::DOM->new($file);
-  $spans->xml(1);
+#  my $spans = Mojo::DOM->new($file);
+#  $spans->xml(1);
+  my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
 
   my ($should, $have) = (0,0);
-  my ($from, $to);
-
-  my $match = $self->match;
 
   my @tokens;
-  $spans->find('span')->each(
-    sub {
-      my $s = shift;
 
-      $should++;
+  foreach my $s (@$spans) {
 
-      if ($self->encoding eq 'bytes') {
-	$from = $self->primary->bytes2chars($s->attr('from'));
-	$to = $self->primary->bytes2chars($s->attr('to'));
-      }
-      else {
-	$from = $s->attr('from');
-	$to = $s->attr('to');
-      };
+    $should++;
 
-      my $pos = $match->lookup($from, $to);
+    my $token = $self->token(
+      $s->{-from},
+      $s->{-to},
+      $s
+    ) or next;
 
-      return unless defined $pos;
+    $have++;
 
-      my $token = KorAP::Tokenizer::Token->new;
-      $token->id($s->attr('id'));
-      $token->pos($pos);
-
-      if (@{$s->children}) {
-	$token->content($s->content_xml);
-      };
-
-      $have++;
-
-      push(@tokens, $token);
-    });
+    push(@tokens, $token);
+  };
 
   $self->should($should);
   $self->have($have);

diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
new file mode 100644
index 0000000..c04ebaf
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Units.pm

@@ -0,0 +1,89 @@
+package KorAP::Tokenizer::Units;
+use KorAP::Tokenizer::Span;
+use KorAP::Tokenizer::Token;
+use Mojo::Base -base;
+
+has [qw/path foundry layer match range primary should have/];
+has 'encoding' => 'utf-8';
+
+sub span {
+  my $self = shift;
+  my ($from, $to, $s) = @_;
+
+  ($from, $to) = $self->_offset($from, $to);
+
+  return unless $to > $from;
+
+  my $span = KorAP::Tokenizer::Span->new;
+
+  $span->id($s->{-id}) if $s && $s->{-id};
+
+  $span->o_start($from);
+  $span->o_end($to);
+
+  my $start = $self->match->startswith($span->o_start);
+
+  unless (defined $start) {
+    $start = $self->range->after($span->o_start) or return;
+  };
+
+  $span->p_start($start);
+
+  my $end = $self->match->endswith($span->o_end);
+  unless ($end) {
+    $end = $self->range->before($span->o_end) or return;
+  };
+  $span->p_end($end);
+
+  return unless $span->p_end >= $span->p_start;
+
+  $span->hash($s) if $s;
+
+  return $span;
+};
+
+sub token {
+  my $self = shift;
+  my ($from, $to, $s) = @_;
+
+  ($from, $to) = $self->_offset($from, $to);
+
+  my $pos = $self->match->lookup($from, $to);
+
+  return unless defined $pos;
+
+#      if ($from == $to) {
+#	print "Unable to find match for $from - $to (resp ".$s->{-from} . '-' . $s->{-to}.") " . $s->{-id};
+#	print "\n";
+#      };
+
+  my $token = KorAP::Tokenizer::Token->new;
+  $token->id($s->{-id}) if $s && $s->{-id};
+  $token->pos($pos);
+
+  $token->hash($s) if $s;
+
+  return $token;
+};
+
+
+sub _offset {
+  my $self = shift;
+  my ($from, $to) = @_;
+
+  if ($self->encoding) {
+    my $p = $self->primary;
+    if ($self->encoding eq 'bytes') {
+      $from = $p->bytes2chars($from);
+      $to = $p->bytes2chars($to);
+    }
+    elsif ($self->encoding eq 'xip') {
+      $from = $p->xip2chars($from);
+      $to = $p->xip2chars($to);
+    };
+  };
+
+  return ($from, $to);
+};
+
+1;

diff --git a/script/log4perl.conf b/script/log4perl.conf
deleted file mode 100644
index 90459dd..0000000
--- a/script/log4perl.conf
+++ /dev/null

@@ -1,5 +0,0 @@
-log4perl.rootLogger=TRACE, STDERR
-
-log4perl.appender.STDERR. = Log::Log4perl::Appender::ScreenColoredLevels
-log4perl.appender.STDERR.layout=PatternLayout
-log4perl.appender.STDERR.layout.ConversionPattern=[%r] %F %L %c - %m%n
\ No newline at end of file

diff --git a/script/prepare_index.pl b/script/prepare_index.pl
index f0dc398..2b98cfa 100644
--- a/script/prepare_index.pl
+++ b/script/prepare_index.pl

@@ -1,405 +1,195 @@
 #!/usr/bin/env perl
 use strict;
 use warnings;
-use v5.16;
 use lib 'lib', '../lib';
-use Set::Scalar;
-use Mojo::DOM;
-use Mojo::Util qw/encode decode/;
-use Mojo::ByteStream 'b';
-
+use Getopt::Long;
+use Benchmark qw/:hireswallclock/;
+use IO::Compress::Gzip qw/$GzipError/;
 use Log::Log4perl;
-Log::Log4perl->init("script/log4perl.conf");
-
 use KorAP::Document;
 use KorAP::Tokenizer;
 
+our $VERSION = 0.01;
+
+# Merges foundry data to create indexer friendly documents
+# ndiewald, 2013/11/05
+
+sub printhelp {
+  print <<'EOHELP';
+
+Merge foundry data based on a tokenization and create indexer friendly documents.
+
+Call:
+prepare_index.pl -z --input <directory> --outputfile <filename>
+
+--input|-i <directory>          Directory of the document to index
+--output|-o <filename>          Document name for output (optional),
+                                Writes to <STDOUT> by default
+--token|-t <foundry>[#<layer>]  Define the default tokenization by specifying
+                                the name of the foundry and optionally the name
+                                of the layer. Defaults to OpenNLP#tokens.
+--skip|-s <foundry>[#<layer>]   Skip specific foundries by specifying the name
+                                or specific layers by defining the name
+                                with a # in front of the foundry,
+                                e.g. Mate#Morpho. Alternatively you can skip #ALL.
+                                Can be set multiple times.
+--allow|-a <foundry>#<layer>    Allow specific foundries and layers by defining them
+                                combining the foundry name with a # and the layer name.
+--primary|-p                    Output primary data or not. Defaults to true.
+                                Can be flagged using --no-primary as well.
+--human|-m                      Represent the data human friendly,
+                                while the output defaults to JSON
+--pretty|-y                     Pretty print json output
+--gzip|-z                       Compress the output
+                                (expects a defined output file)
+--log|-l                        The Log4perl log level, defaults to ERROR.
+--help|-h                       Print this document (optional)
+
+diewald@ids-mannheim.de, 2013/11/04
+
+EOHELP
+  exit(defined $_[0] ? $_[0] : 0);
+};
+
+# Options from the command line
+my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);
+GetOptions(
+  'input|x=s'   => \$input,
+  'output|o=s'  => \$output,
+  'human|m'     => \$text,
+  'token|t=s'   => \$token_base,
+  'gzip|z'      => \$gzip,
+  'skip|s=s'    => \@skip,
+  'log|l=s'     => \$log_level,
+  'allow|a=s'   => \@allow,
+  'primary|p!'  => \$primary,
+  'pretty|y'    => \$pretty,
+  'help|h'      => sub { printhelp }
+);
+
+printhelp(1) if !$input || ($gzip && !$output);
+
+$log_level //= 'ERROR';
+
+my %skip;
+$skip{lc($_)} = 1 foreach @skip;
+
+Log::Log4perl->init({
+  'log4perl.rootLogger' => uc($log_level) . ', STDERR',
+  'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+  'log4perl.appender.STDERR.layout' => 'PatternLayout',
+  'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+
+my $log = Log::Log4perl->get_logger('main');
+
+BEGIN {
+  $main::TIME = Benchmark->new;
+  $main::LAST_STOP = Benchmark->new;
+};
+
+sub stop_time {
+  my $new = Benchmark->new;
+  $log->trace(
+    'The code took: '.
+      timestr(timediff($new, $main::LAST_STOP)) .
+	' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
+      );
+  $main::LAST_STOP = $new;
+};
 
 # Call perl script/prepare_index.pl WPD/AAA/00001
 
-sub parse_doc {
-  my $doc = KorAP::Document->new(
-    path => shift . '/'
-  );
-
-  $doc->parse;
-
-  my $tokens = KorAP::Tokenizer->new(
-    path => $doc->path,
-    doc => $doc,
-    foundry => 'connexor',
-    layer => 'tokens'
-  );
-
-  $tokens->parse;
-
-  my $i = 0;
-  $tokens->add_spandata(
-    foundry => 'connexor',
-    layer => 'sentences',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $span) = @_;
-      my $mtt = $stream->pos($span->p_start);
-      $mtt->add(
-	term => '<>:s',
-	o_start => $span->o_start,
-	o_end => $span->o_end,
-	p_end => $span->p_end
-      );
-      $i++;
-    }
-  );
-
-  $tokens->stream->add_meta('s', '<i>' . $i);
-
-  $i = 0;
-  $tokens->add_spandata(
-    foundry => 'base',
-    layer => 'paragraph',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $span) = @_;
-      my $mtt = $stream->pos($span->p_start);
-      $mtt->add(
-	term => '<>:p',
-	o_start => $span->o_start,
-	o_end => $span->o_end,
-	p_end => $span->p_end
-      );
-      $i++;
-    }
-  );
-  $tokens->stream->add_meta('p', '<i>' . $i);
-
-  $tokens->add_tokendata(
-    foundry => 'opennlp',
-    layer => 'morpho',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
-
-      my $found;
-
-      # syntax
-      if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'opennlp_p:' . $found
-	);
-      };
-    });
+# Create and parse new document
+$input =~ s{([^/])$}{$1/};
+my $doc = KorAP::Document->new( path => $input );
+$doc->parse;
 
 
-  my $model = 'ne_dewac_175m_600';
-  $tokens->add_tokendata(
-    foundry => 'corenlp',
-    #skip => 1,
-    layer => $model,
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
+my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
+if ($token_base) {
+  ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
+};
 
-      my $found;
+# Get tokenization
+my $tokens = KorAP::Tokenizer->new(
+  path => $doc->path,
+  doc => $doc,
+  foundry => $token_base_foundry,
+  layer => $token_base_layer,
+  name => 'tokens'
+);
+$tokens->parse;
 
-      if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'corenlp_' . $model . ':' . $found
-	);
-      };
-    });
+my @layers;
 
-  $model = 'ne_hgc_175m_600';
-  $tokens->add_tokendata(
-    foundry => 'corenlp',
-    #skip => 1,
-    layer => $model,
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
+# Base information
+push(@layers, ['OpenNLP', 'Sentences']);
+push(@layers, ['Base', 'Paragraphs']);
 
-      my $found;
+# OpenNLP
+push(@layers, ['OpenNLP', 'Morpho']);
 
-      if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'corenlp_' . $model . ':' . $found
-	);
-      };
-    });
+# CoreNLP
+push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
+push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
 
-  $tokens->add_tokendata(
-    foundry => 'connexor',
-    layer => 'morpho',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
+# Connexor
+push(@layers, ['Connexor', 'Morpho']);
+push(@layers, ['Connexor', 'Syntax']);
+push(@layers, ['Connexor', 'Phrase']);
 
-      my $found;
+# TreeTagger
+push(@layers, ['TreeTagger', 'Morpho']);
 
-      # Lemma
-      if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
-	if (index($found, "\N{U+00a0}") >= 0) {
-	  $found = b($found)->decode;
-	  foreach (split(/\x{00A0}/, $found)) {
-	    $mtt->add(
-	      term => 'cnx_l:' . b($_)->encode
-	    );
-	  }
-	}
-	else {
-	  $mtt->add(
-	    term => 'cnx_l:' . $found # b($found)->encode
-	  );
-	};
-      };
+# Mate
+push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Dependency']);
 
-      # POS
-      if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'cnx_p:' . $found
-	);
-      };
-
-      # MSD
-      # Todo: Look in the description!
-      if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
-	foreach (split(':', $found)) {
-	  $mtt->add(
-	    term => 'cnx_m:' . $_
-	  );
-	};
-      };
-    }
-  );
-
-  $tokens->add_tokendata(
-    foundry => 'connexor',
-    layer => 'syntax',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
-
-      my $found;
-
-      # syntax
-      if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'cnx_syn:' . $found
-	);
-      };
-    });
-
-  $tokens->add_spandata(
-    foundry => 'connexor',
-    layer => 'phrase',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $span) = @_;
-
-      my $type = $span->content->at('f[name=pos]');
-      if ($type && ($type = $type->text)) {
-	my $mtt = $stream->pos($span->p_start);
-	$mtt->add(
-	  term => '<>:cnx_const:' . $type,
-	  o_start => $span->o_start,
-	  o_end => $span->o_end,
-	  p_end => $span->p_end
-	);
-      };
-    }
-  );
-
-  $tokens->add_tokendata(
-    foundry => 'tree_tagger',
-    #skip => 1,
-    layer => 'morpho',
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
-
-      my $found;
-
-      # lemma
-      if (($found = $content->at('f[name="lemma"]')) &&
-	    ($found = $found->text) && $found ne 'UNKNOWN') {
-	$mtt->add(
-	  term => 'tt_l:' . $found
-	);
-      };
-
-      # pos
-      if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'tt_p:' . $found
-	);
-      };
-    });
-
-  $tokens->add_tokendata(
-    foundry => 'mate',
-    layer => 'morpho',
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
-
-      my $found;
-
-      my $capital = 0;
-
-      # pos
-      if (($found = $content->at('f[name="pos"]')) &&
-	    ($found = $found->text)) {
-	$mtt->add(term => 'mate_p:' . $found
-	);
-      };
-
-      # lemma
-      if (($found = $content->at('f[name="lemma"]'))
-	    && ($found = $found->text)
-	      && $found ne '--') {
-	$mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
-      };
-
-      # MSD
-      if (($found = $content->at('f[name="msd"]')) &&
-	    ($found = $found->text) &&
-	      ($found ne '_')) {
-	foreach (split '\|', $found) {
-	  my ($x, $y) = split "=", $_;
-	  # case, tense, number, mood, person, degree, gender
-	  $mtt->add(term => 'mate_m:' . $x . ':' . $y);
-	};
-      };
-    });
+# XIP
+push(@layers, ['XIP', 'Morpho']);
+push(@layers, ['XIP', 'Constituency']);
+push(@layers, ['XIP', 'Dependency']);
 
 
-  $tokens->add_tokendata(
-    foundry => 'xip',
-    #skip => 1,
-    layer => 'morpho',
-    encoding => 'bytes',
-    cb => sub {
-      my ($stream, $token) = @_;
-      my $mtt = $stream->pos($token->pos);
-      my $content = $token->content;
-
-      my $found;
-
-      my $capital = 0;
-      # pos
-      if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
-	$mtt->add(
-	  term => 'xip_p:' . $found
-	);
-
-	$capital = 1 if $found eq 'NOUN';
-      };
-
-      # lemma
-      if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
-	my (@token) = split('#', $found);
-
-	my $full = '';
-	foreach (@token) {
-	  $full .= $_;
-	  $_ =~ s{/\w+$}{};
-	  $mtt->add(term => 'xip_l:' . $_);
-	};
-	if (@token > 1) {
-	  $full =~ s{/}{}g;
-	  $full = lc $full;
-	  $full = $capital ? ucfirst($full) : $full;
-	  $mtt->add(term => 'xip_l:' . $full);
-	};
-      };
-    });
-
-
-  # Collect all spans and check for roots
-  my %xip_const;
-  my $xip_const_root = Set::Scalar->new;
-  my $xip_const_noroot = Set::Scalar->new;
-
-  # First run:
-  $tokens->add_spandata(
-    foundry => 'xip',
-    layer => 'constituency',
-    encoding => 'bytes',
-    #skip => 1,
-    cb => sub {
-      my ($stream, $span) = @_;
-
-      $xip_const{$span->id} = $span;
-      $xip_const_root->insert($span->id);
-
-      $span->content->find('rel[label=dominates][target]')->each(
-	sub {
-	  my $rel = shift;
-	  $xip_const_noroot->insert($rel->attr('target'));
-	}
-      );
-    }
-  );
-
-  my $stream = $tokens->stream;
-
-  my $add_const = sub {
-    my $span = shift;
-    my $level = shift;
-    my $mtt = $stream->pos($span->p_start);
-
-    my $content = $span->content;
-    my $type = $content->at('f[name=const]');
-    if ($type && ($type = $type->text)) {
-      # $type is now NPA, NP, NUM
-      my %term = (
-	term => '<>:xip_const:' . $type,
-	o_start => $span->o_start,
-	o_end => $span->o_end,
-	p_end => $span->p_end
-      );
-
-      $term{payload} = '<s>' . $level if $level;
-
-      $mtt->add(%term);
-
-      my $this = __SUB__;
-
-      $content->find('rel[label=dominates][target]')->each(
-	sub {
-	  my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
-	  $this->($subspan, $level + 1);
-	}
-      );
+if ($skip{'#all'}) {
+  foreach (@allow) {
+    $tokens->add(split('#', $_));
+    stop_time;
+  };
+}
+else {
+  # Add to index file - respect skipping
+  foreach my $info (@layers) {
+    unless ($skip{lc($info->[0]) . '#' . lc($info->[1])}) {
+      $tokens->add(@$info);
+      stop_time;
     };
   };
+};
 
-  my $diff = $xip_const_root->difference($xip_const_noroot);
-  foreach ($diff->members) {
-    my $obj = delete $xip_const{$_} or next;
-    $add_const->($obj, 0);
+my $file;
+
+my $print_text = $text ? $tokens->to_string($primary) : ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
+
+if ($output) {
+  if ($gzip) {
+    $file = IO::Compress::Gzip->new($output, Minimal => 1);
+  }
+  else {
+    $file = IO::File->new($output, "w");
   };
 
-  # Todo: Add mate-morpho
-  # Todo: Add mate-dependency
-  # Todo: Add xip-dependency
+#  binmode $file, ':utf8';
 
-  print $tokens->stream->to_string;
+  $file->print($print_text);
+  $file->close;
+}
+else {
+#  binmode STDOUT, ':utf8';
+  print $print_text . "\n";
 };
 
-if ($ARGV[0]) {
-  parse_doc($ARGV[0]);
-};
-
-
+stop_time;
 
 __END__

diff --git a/t/primary.t b/t/primary.t
new file mode 100644
index 0000000..831124e
--- /dev/null
+++ b/t/primary.t

@@ -0,0 +1,88 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use Mojo::ByteStream 'b';
+use utf8;
+use lib 'lib', '../lib';
+
+use_ok('KorAP::Document::Primary');
+
+my $t = "Der März ging vorüber und demnächst würde es Herbstblätter regnen.";
+
+ok(my $p = KorAP::Document::Primary->new($t), 'Constructor');
+
+is($p->data_length, 66, 'Text has correct length');
+
+is($p->data, $t, 'Text is identical');
+is($p->data(0,3), 'Der', 'Text is identical');
+is($p->data(4,8), 'März', 'Text is identical');
+is($p->data(26,35), 'demnächst', 'Text is identical');
+
+is($p->data_bytes(0,3), 'Der', 'Text is identical');
+is($p->data_bytes(4,9), 'März', 'Text is identical');
+is($p->data_bytes(28,38), 'demnächst', 'Text is identical');
+
+is($p->bytes2chars(4), 4, 'Byte offset matches');
+is($p->bytes2chars(9), 8, 'Byte offset matches');
+is($p->bytes2chars(28), 26, 'Byte offset matches');
+is($p->bytes2chars(38), 35, 'Byte offset matches');
+
+is(
+  $p->data(
+    $p->bytes2chars(17),
+    $p->bytes2chars(45)
+  ),
+  $p->data_bytes(17,45),
+  'Text is identical'
+);
+
+$t = 'Er dächte, daß dies „für alle Elemente gilt“.';
+
+ok($p = KorAP::Document::Primary->new($t), 'Constructor');
+
+is($p->data_length, 45, 'Text has correct length');
+
+is($p->data, $t, 'Text is identical');
+is($p->data(0,2), 'Er', 'Text is identical');
+is($p->data(3,9), 'dächte', 'Text is identical');
+is($p->data(21,24), 'für', 'Text is identical');
+is($p->data(20,21), '„', 'Text is identical');
+is($p->data(43,44), '“', 'Text is identical');
+is($p->data(44,45), '.', 'Text is identical');
+
+is($p->data_bytes(0,2), 'Er', 'Text is identical');
+is($p->bytes2chars(0),0, 'b2c correct');
+is($p->bytes2chars(2),2, 'b2c correct');
+is($p->data_bytes(3,10), 'dächte', 'Text is identical');
+is($p->bytes2chars(3),3, 'b2c correct');
+is($p->bytes2chars(10),9, 'b2c correct');
+is($p->data_bytes(25,29), 'für', 'Text is identical');
+is($p->bytes2chars(25),21, 'b2c correct');
+is($p->bytes2chars(29),24, 'b2c correct');
+is($p->data_bytes(22,25), '„', 'Text is identical');
+is($p->bytes2chars(22),20, 'b2c correct');
+is($p->bytes2chars(25),21, 'b2c correct');
+is($p->data_bytes(48,51), '“', 'Text is identical');
+is($p->bytes2chars(48),43, 'b2c correct');
+is($p->bytes2chars(51),44, 'b2c correct');
+is($p->data_bytes(51,52), '.', 'Text is identical');
+is($p->bytes2chars(52),45, 'b2c correct');
+
+is(
+  $p->data(
+    $p->bytes2chars(17),
+    $p->bytes2chars(45)
+  ),
+  $p->data_bytes(17,45),
+  'Text is identical'
+);
+
+
+#ok($p = KorAP::Document::Primary->new($t), 'Constructor');
+is($p->xip2chars(0), 0, 'Fine');
+is($p->xip2chars(7), 6, 'Fine');
+#diag $p->data($p->latinbytes2chars(3),$p->latinbytes2chars(9));
+
+
+done_testing;
commit	7364d1f29b898a41bb2074a34ce671f7c36ca5de	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Tue Nov 05 19:26:35 2013 +0000
committer	Nils Diewald <nils@diewald-online.de>	Tue Nov 05 19:26:35 2013 +0000
tree	94575601cacd010be2d075a2b8a6e1926fdb60f1
parent	2db9ad0d484214d641313937810225a635a997a4 [diff]